1 import sys, os
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
3 from Core.IdSet import IdSet
4 from operator import itemgetter
# Probe for a usable numpy: importing alone is not enough, it must also be
# able to construct an array (guards against broken installations).
try:
    import numpy
    numpy.array([])
    numpyAvailable = True
# BUG FIX: the original bare 'except:' also swallowed KeyboardInterrupt and
# SystemExit; Exception is still broad because the array probe may raise
# something other than ImportError on a broken install.
except Exception:
    numpyAvailable = False
11
def writeModel(svs, modelfile, newfile, tokenized=False):
    """Replace the support-vector line (last line) of an SVM-multiclass
    model file with the vectors in *svs* and write the result to *newfile*.

    svs       -- iterable of feature vectors; zero-valued features are skipped
                 and the rest written sparsely as "index:value" (1-based,
                 running over all vectors).
    modelfile -- path of the existing model whose other lines are kept as-is.
    newfile   -- path the modified model is written to.
    tokenized -- if True, emit one token per line plus a trailing "#" line
                 instead of the normal space-separated "... #" format.
    """
    with open(modelfile, "rt") as f:
        lines = f.readlines()

    # The support-vector line always starts with "1 qid:0".
    if tokenized:
        fline = "1\nqid:0"
        separator = "\n"
    else:
        fline = "1 qid:0"
        separator = " "

    index = 1  # sparse feature index advances for zero features too
    for sv in svs:
        for feature in sv:
            if feature != 0:
                # BUG FIX: the original concatenated 'feature' without str(),
                # which raised TypeError for numeric feature values.
                fline += separator + str(index) + ":" + str(feature)
            index += 1
    if tokenized:
        lines[-1] = fline + "\n#\n"
    else:
        lines[-1] = fline + " #"

    with open(newfile, "wt") as f:
        f.writelines(lines)
40
# NOTE(review): this region is a numbered paste — the leading integers are the
# original file's own line numbers, not code. The 'def' header (original line
# 41) was lost, so this function's name and signature cannot be recovered from
# what is visible here; TODO restore the header from version control.
# What the body demonstrably does: reads 'modelfile', then writes 'newfile'
# with every line but the last copied verbatim and the last (support-vector)
# line split into one whitespace-separated token per output line.
42 f = open(modelfile,"rt")
43 lines = f.readlines()
44 f.close()
45 lastLine = lines[-1]
46 lines = lines[:-1]
47
48 f = open(newfile, "wt")
49 for line in lines:
50 f.write(line)
51 for token in lastLine.split():
52 f.write(token + "\n")
53 f.close()
54
# NOTE(review): the 'def' header was missing in the reviewed paste; the name
# and one-argument signature are recovered from the call site
# 'parseModel(modelfile)' later in this file.
def parseModel(modelfile):
    """Read the header of an SVM-multiclass (Joachims) model file.

    Returns (numClasses, numFeatures, highestIndex) parsed from the
    'number of classes', 'number of base features' and 'highest feature
    index' comment lines; -1 for any value whose header line is absent
    (the original could raise NameError in that case — the -1 sentinel is
    what downstream diagnostics test against).
    """
    numClasses = numFeatures = highestIndex = -1
    with open(modelfile, "rt") as f:
        for line in f:
            if line.find("number of classes") != -1:
                numClasses = int(line.split("#")[0])
            elif line.find("number of base features") != -1:
                numFeatures = int(line.split("#")[0])
            elif line.find("highest feature index") != -1:
                highestIndex = int(line.split("#")[0])
    return numClasses, numFeatures, highestIndex
66
# NOTE(review): the 'def' header was missing in the reviewed paste; the name
# comes from the __main__ call 'getSupportVectors(options.model)' and the
# keyword parameter from the 'valueToFloat' reference in the body.
def getSupportVectors(modelfile, valueToFloat=True):
    """Expand the sparse support-vector line of an SVM-multiclass model.

    The model's last line holds '1 qid:0 index:value ...' tokens; they are
    densified (explicit zeros for skipped indices) and split into chunks of
    highestIndex // numClasses features, one list per class. Values are
    converted to float unless valueToFloat is False. Returns the list of
    chunks (as numpy arrays when numpy is available); note the caller in
    __main__ drops a trailing empty chunk with s[0:-1].
    """
    numClasses, numFeatures, highestIndex = parseModel(modelfile)

    with open(modelfile, "rt") as f:
        line = f.readlines()[-1]
    line = line.rsplit("#", 1)[0]  # strip the trailing comment marker
    tokens = line.split()
    assert tokens[1].find("qid") != -1
    tokens = tokens[2:]  # skip the leading "1 qid:0"
    # BUG FIX: '/' is true division under Python 3; the chunk size must stay
    # integral for the len() == numFeaturesPerClass comparisons below.
    numFeaturesPerClass = highestIndex // numClasses

    svs = [[]]
    num = 0  # last sparse feature index seen
    for token in tokens:
        newNum, value = token.split(":")
        newNum = int(newNum)
        if valueToFloat:
            value = float(value)
        assert newNum > num
        # fill explicit zeros for the indices the sparse format skipped
        while newNum - num > 1:
            svs[-1].append(0)
            if len(svs[-1]) == numFeaturesPerClass:
                svs.append([])
            num += 1
        svs[-1].append(value)
        if len(svs[-1]) == numFeaturesPerClass:
            svs.append([])
        num = newNum
    # pad the tail with zeros up to the highest feature index
    # (no chunk splitting here, matching the original behavior)
    while num < highestIndex - 1:
        svs[-1].append(0)
        num += 1

    if numpyAvailable:
        for i in range(len(svs)):
            svs[i] = numpy.array(svs[i])

    return svs
108
# NOTE(review): numbered paste; the 'def' header (original line 109) was lost.
# The body uses 'line', 'numClasses' and 'numFeatures' without defining them,
# so they were presumably parameters — the signature cannot be recovered from
# here; TODO restore from version control. It appears to be a Python 2
# diagnostic that counts the real (densified) token span of a support-vector
# line and sanity-checks it against numClasses * numFeatures.
110 print line[-500:]
111 line = line.rsplit("#",1)[0]
112 print line[-500:]
113 tokens = line.split()
114 assert tokens[1].find("qid") != -1
115 tokens = tokens[2:]
116 num = 0
117 realTokens = 0
118 for token in tokens:
119 newNum = int(token.split(":")[0])
120
121 if newNum - num > 1:
122 pass
123
124 realTokens += newNum - num
125 num = newNum
126 print "r", realTokens, realTokens/numClasses
127 if numFeatures != -1:
128 print "Classes", numClasses
129 print "Features:", numFeatures
130 print "Tokens:", len(tokens), "Classes*features:", numClasses * numFeatures
131 assert len(tokens) == numClasses * numFeatures
132
# NOTE(review): only the header of mapIds survived the paste — the body
# (original lines 134-142) is missing entirely and cannot be reconstructed
# from what is visible here; TODO recover it from version control. The '-'
# prefix is a stray diff marker from the paste, not code.
133 -def mapIds(featureIds, modelFile):
143
# NOTE(review): the 'def' header was missing in the reviewed paste; the name
# and signature are recovered from the __main__ call 'w = getWeights(s)'.
def getWeights(svs):
    """Return, per feature index, the maximum absolute value any support
    vector in *svs* takes at that index.

    svs -- non-empty list of equal-length numeric sequences.

    The result carries one trailing 0 beyond the last feature (the unused
    running-maximum slot the original algorithm appended); assignNames
    depends on this length, so it is preserved.
    """
    numFeatures = len(svs[0])
    weights = [0]
    for i in range(numFeatures):
        for sv in svs:
            assert len(sv) == numFeatures, (len(sv), numFeatures)
            absFeature = abs(sv[i])
            if absFeature > weights[-1]:
                weights[-1] = absFeature
        weights.append(0)
    return weights
157
# NOTE(review): numbered paste; the 'def' header (original line 158) was lost,
# so this function's name is unknown — the body is an exact duplicate of the
# per-feature max-abs-weight routine above it (possibly a per-class variant,
# or a dead copy). TODO recover the header from version control.
159 numFeatures = len(svs[0])
160
161 weights = [0]
162 for i in range(numFeatures):
163 for sv in svs:
164
165 assert len(sv) == numFeatures
166 absFeature = abs(sv[i])
167 if absFeature > weights[-1]:
168 weights[-1] = absFeature
169 weights.append(0)
170 return weights
171
# NOTE(review): the 'def' header was missing in the reviewed paste; name and
# signature recovered from the __main__ call 'assignNames(w, IdSet(...))'.
def assignNames(weights, featureIds):
    """Pair each weight with its feature name and sort by ascending weight.

    weights    -- sequence of weights, position i belonging to feature id i+1
                  (feature ids are 1-based).
    featureIds -- object with a getName(id) method (e.g. Core.IdSet.IdSet).

    Returns a list of (weight, name) tuples sorted by weight (stable sort).
    """
    tuples = [(weight, featureIds.getName(i + 1))
              for i, weight in enumerate(weights)]
    tuples.sort(key=itemgetter(0))
    return tuples
178
# NOTE(review): the 'def' header was missing in the reviewed paste; name and
# signature recovered from the __main__ call 'getTokenWeights(w)'.
def getTokenWeights(weights):
    """Collapse feature names into their '_'-separated tokens.

    weights -- iterable of (weight, name) pairs, as produced by assignNames.

    Returns a dict mapping each token to the maximum weight of any pair
    whose name contains that token.
    """
    # BUG FIX: the original used dict.has_key() (removed in Python 3) and
    # shadowed the builtin 'dict' with a local of the same name.
    maxByToken = {}
    for pair in weights:
        for token in pair[1].split("_"):
            if token not in maxByToken:
                maxByToken[token] = 0
            if pair[0] > maxByToken[token]:
                maxByToken[token] = pair[0]
    return maxByToken
189
if __name__ == "__main__":
    # Psyco was a Python 2 JIT compiler; keep the opportunistic speed-up but
    # never fail when it is absent.
    # BUG FIX: 'print >> sys.stderr' is Python-2-only syntax; sys.stderr.write
    # works on both 2 and 3.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    from optparse import OptionParser
    optparser = OptionParser(description="Joachims SVM Multiclass model file processing")
    optparser.add_option("-i", "--ids", default=None, dest="ids", help="SVM feature ids")
    optparser.add_option("-m", "--model", default=None, dest="model", help="SVM model file")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file stem")
    (options, args) = optparser.parse_args()

    s = getSupportVectors(options.model)
    print("vectors: " + str(len(s)))
    s = s[0:-1]  # drop the trailing empty chunk getSupportVectors leaves behind

    # Per-feature maximum absolute weights, named via the id set and sorted.
    w = getWeights(s)
    w = assignNames(w, IdSet(filename=options.ids))
    with open(options.output + "weights.txt", "wt") as f:
        for pair in w:
            f.write(str(pair[0]) + "\t" + str(pair[1]) + "\n")

    # Collapse feature names into '_' tokens, keeping each token's max weight.
    d = getTokenWeights(w)
    with open(options.output + "weights-tokens.txt", "wt") as f:
        for pair in sorted(d.items(), key=itemgetter(1)):
            f.write(str(pair[1]) + "\t" + str(pair[0]) + "\n")