import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
from Core.IdSet import IdSet
from operator import itemgetter
try:
    import numpy
    numpy.array([])
    numpyAvailable = True
except:
    numpyAvailable = False

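# Overview: these utilities post-process a model file produced by Joachims'
# SVM-multiclass: parse its header, expand the sparse weight line into dense
# per-class vectors, and dump per-feature / per-token weights. A typical
# invocation (the script and file names below are only illustrative) would be
#
#   python SVMMultiClassModelUtils.py -i feature.ids -m svm.model -o run1-
#
# which writes run1-weights.txt and run1-weights-tokens.txt.
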
def writeModel(svs, modelfile, newfile, tokenized=False):
    # Replace the last line of an existing model file with a support vector
    # line rebuilt from svs (a list of per-class feature vectors).
    f = open(modelfile,"rt")
    lines = f.readlines()
    f.close()
    
    if tokenized:
        fline = "1\nqid:0"
    else:
        fline = "1 qid:0"
    index = 1
    for sv in svs:
        for feature in sv:
            if feature != 0:
                if tokenized:
                    fline += "\n"
                else:
                    fline += " "
                fline += str(index) + ":" + str(feature)
            index += 1
    if tokenized:
        lines[-1] = fline + "\n#\n"
    else:
        lines[-1] = fline + " #"
    
    f = open(newfile, "wt")
    for line in lines:
        f.write(line)
    f.close()

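# Example (illustrative values): writeModel([[0.5, 0, -1.25]], "svm.model",
# "new.model") keeps every line of "svm.model" except the last one, which is
# replaced by the rebuilt support vector line
#   1 qid:0 1:0.5 3:-1.25 #
# (zero-valued features are skipped; indices run from 1 across all classes).
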
# NOTE: the original 'def' line is missing here; the name and signature are
# inferred from the body, which rewrites a model file so that each token of
# the final support vector line ends up on its own line.
def tokenizeModel(modelfile, newfile):
    f = open(modelfile,"rt")
    lines = f.readlines()
    f.close()
    lastLine = lines[-1]
    lines = lines[:-1]
    
    f = open(newfile, "wt")
    for line in lines:
        f.write(line)
    for token in lastLine.split():
        f.write(token + "\n")
    f.close()

def parseModel(modelfile):
    # Read the model file header and return (number of classes, number of
    # base features, highest feature index).
    f = open(modelfile,"rt")
    for line in f:
        if line.find("number of classes") != -1:
            numClasses = int(line.split("#")[0])
        elif line.find("number of base features") != -1:
            numFeatures = int(line.split("#")[0])
        elif line.find("highest feature index") != -1:
            highestIndex = int(line.split("#")[0])
    f.close()
    return numClasses, numFeatures, highestIndex

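# For reference, parseModel() looks for header lines of the form
# "<value> # <description>" in the SVM-multiclass model file; the values in
# this sketch are illustrative only:
#
#   4 # number of classes
#   133622 # number of base features
#   534488 # highest feature index
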
# NOTE: the original 'def' line is missing; the name comes from the call in
# __main__, and the valueToFloat default is an assumption (downstream code
# applies abs() to the values, so they must be numeric).
def getSupportVectors(modelfile, valueToFloat=True):
    numClasses, numFeatures, highestIndex = parseModel(modelfile)
    
    # The last line of the model file holds the weights as one sparse vector.
    f = open(modelfile,"rt")
    line = f.readlines()[-1]
    f.close()
    line = line.rsplit("#",1)[0]
    tokens = line.split()
    assert tokens[1].find("qid") != -1
    tokens = tokens[2:]
    numFeaturesPerClass = highestIndex / numClasses
    
    # Expand the sparse "index:value" tokens into dense per-class vectors.
    svs = [[]]
    svIndex = 0
    num = 0
    for token in tokens:
        newNum, value = token.split(":")
        newNum = int(newNum)
        if valueToFloat:
            value = float(value)
        assert newNum > num
        while newNum - num > 1:
            svs[-1].append(0)
            if len(svs[-1]) == numFeaturesPerClass:
                svs.append([])
            num += 1
        svs[-1].append(value)
        if len(svs[-1]) == numFeaturesPerClass:
            svs.append([])
        num = newNum
    # Pad trailing zero features up to the highest index so that the last
    # class vector is full length; a trailing empty list is left at the end
    # (callers are expected to drop it).
    while num < highestIndex:
        svs[-1].append(0)
        if len(svs[-1]) == numFeaturesPerClass:
            svs.append([])
        num += 1
    
    if numpyAvailable:
        for i in range(len(svs)):
            svs[i] = numpy.array(svs[i])
    
    return svs

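# Example (illustrative): with 2 classes and highest feature index 6
# (i.e. 3 features per class), the sparse weight line
#   1 qid:0 2:0.5 5:-1.25 #
# expands to the per-class vectors
#   svs[0] -> [0, 0.5, 0]      (class 1, features 1-3)
#   svs[1] -> [0, -1.25, 0]    (class 2, features 4-6)
# plus a trailing empty vector, which __main__ drops with s[0:-1].
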
# NOTE: the original 'def' line is missing; the name and signature are
# inferred from the body, which sanity-checks the sparse weight line of a
# model file against the header counts.
def checkModelLine(line, numClasses, numFeatures):
    print line[-500:]
    line = line.rsplit("#",1)[0]
    print line[-500:]
    tokens = line.split()
    assert tokens[1].find("qid") != -1
    tokens = tokens[2:]
    num = 0
    realTokens = 0
    for token in tokens:
        newNum = int(token.split(":")[0])
        realTokens += newNum - num
        num = newNum
    print "r", realTokens, realTokens/numClasses
    if numFeatures != -1:
        print "Classes", numClasses
        print "Features:", numFeatures
        print "Tokens:", len(tokens), "Classes*features:", numClasses * numFeatures
        assert len(tokens) == numClasses * numFeatures

def mapIds(featureIds, modelFile):

def getWeights(svs):
    # For each feature, take the largest absolute weight over all class vectors.
    numFeatures = len(svs[0])
    
    weights = [0]
    for i in range(numFeatures):
        for sv in svs:
            assert len(sv) == numFeatures, (len(sv), numFeatures)
            absFeature = abs(sv[i])
            if absFeature > weights[-1]:
                weights[-1] = absFeature
        weights.append(0)
    return weights

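# Example (illustrative): getWeights([[0.5, -2.0], [1.0, 0.3]]) returns
# [1.0, 2.0, 0], i.e. the maximum absolute weight per feature column plus a
# trailing 0 appended by the final loop iteration.
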
def assignNames(weights, featureIds):
    # Pair each weight with its feature name from the id set, sorted by weight.
    tuples = []
    for i in range(len(weights)):
        tuples.append( (weights[i], featureIds.getName(i+1)) )
    tuples.sort(key=itemgetter(0))
    return tuples

def getTokenWeights(weights):
    # Split each feature name on "_" and keep the highest weight seen for
    # every individual token.
    tokenWeights = {}
    for pair in weights:
        tokens = pair[1].split("_")
        for token in tokens:
            if token not in tokenWeights:
                tokenWeights[token] = 0
            if pair[0] > tokenWeights[token]:
                tokenWeights[token] = pair[0]
    return tokenWeights

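# Example (illustrative feature names): if assignNames() produced
#   [(0.1, "POS_NN_protein"), (0.7, "lemma_binds")]
# then getTokenWeights() returns
#   {"POS": 0.1, "NN": 0.1, "protein": 0.1, "lemma": 0.7, "binds": 0.7}
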
if __name__=="__main__":
    # Use the Psyco JIT if it is installed.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using it"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    
    from optparse import OptionParser
    optparser = OptionParser(description="Joachims SVM Multiclass model file processing")
    optparser.add_option("-i", "--ids", default=None, dest="ids", help="SVM feature ids")
    optparser.add_option("-m", "--model", default=None, dest="model", help="SVM model file")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file stem")
    (options, args) = optparser.parse_args()
    
    s = getSupportVectors(options.model)
    print "vectors:", len(s)
    s = s[0:-1] # drop the trailing empty vector left by getSupportVectors
    
    w = getWeights(s)
    w = assignNames(w, IdSet(filename=options.ids))
    f = open(options.output + "weights.txt", "wt")
    for pair in w:
        f.write(str(pair[0]) + "\t" + str(pair[1]) + "\n")
    f.close()
    
    d = getTokenWeights(w)
    f = open(options.output + "weights-tokens.txt", "wt")
    for pair in sorted(d.items(), key=itemgetter(1)):
        f.write(str(pair[1]) + "\t" + str(pair[0]) + "\n")
    f.close()