Package TEES :: Package Core :: Module IdSet
[hide private]

Source Code for Module TEES.Core.IdSet

  1  """ 
  2  Manages classification class and feature ids. 
  3  """ 
  4  __version__ = "$Revision: 1.20 $" 
  5   
  6  import codecs 
  7  import gzip 
  8   
class IdSet:
    """
    A mapping from strings to id integers. This class is used for defining the ids for classes
    and features of machine learning systems.

    Two dictionaries are kept in sync: C{Ids} maps names to ids and C{_namesById} maps ids
    back to names. C{nextFreeId} is the number given to the next name that is defined
    automatically by L{getId}.
    """

    def __init__(self, firstNumber=1, idDict=None, locked=False, filename=None, allowNewIds=True):
        """
        Creates a new IdSet or loads one from a dictionary or a file.

        To create a new, empty set: idset = IdSet(firstNumber = x).
        To create a set from a str->int dictionary: idset = IdSet(idDict = x).
        To load a dictionary from a file: idset = IdSet(filename = x).

        @param firstNumber: The number given to the first name defined. Subsequent names will
        have higher numbers.
        @type firstNumber: int
        @param idDict: Dictionary of name / integer pairs. The integer values must be unique.
        @type idDict: dictionary
        @param locked: Whether new names can be added to the set. If set to True, getId will
        return None for names that are not already in the set.
        @type locked: boolean
        @param filename: load name/id pairs from a file
        @type filename: str
        @param allowNewIds: Default for getId's "createIfNotExist" argument when the caller
        does not specify it.
        @type allowNewIds: boolean
        """
        self.Ids = {}
        self._namesById = {}
        self._firstNumber = firstNumber  # fallback for load() when the file has no entries
        self.nextFreeId = firstNumber
        # allow new ids when calling getId without specifying "createIfNotExist"
        self.allowNewIds = allowNewIds
        # The set must be unlocked while the initial pairs are inserted; the requested
        # lock state is applied once they are in place.
        self.locked = False

        if idDict is not None:
            # defineId only accepts ids below nextFreeId, so lift the limit while importing.
            self.nextFreeId = float("inf")
            for name, number in idDict.items():
                self.defineId(name, number)
            if self._namesById:
                self.nextFreeId = max(self._namesById) + 1
            else:
                # An empty dictionary defines nothing: start numbering normally.
                self.nextFreeId = firstNumber
        self.locked = locked

        if filename is not None:
            self.load(filename)

    def getId(self, key, createIfNotExist=None):
        """
        Returns the id number for a name. If the name doesn't already have an id, a new id is
        defined, unless the set is locked or createIfNotExist is false, in which case None is
        returned.

        @type key: str
        @param key: name
        @type createIfNotExist: True, False or None
        @param createIfNotExist: If the name doesn't have an id, define an id for it. None
        means "use the object level setting allowNewIds".
        @rtype: int or None
        @return: an identifier
        """
        if createIfNotExist is None:  # no local override to object level setting
            createIfNotExist = self.allowNewIds
        if key not in self.Ids:
            if self.locked or not createIfNotExist:
                return None
            number = self.nextFreeId
            self.nextFreeId += 1
            self.Ids[key] = number
            self._namesById[number] = key
        return self.Ids[key]

    def __getitem__(self, name):
        """
        Calls getId through the []-operator. Like getId, this may define a new id for an
        unknown name, or return None if that is not allowed.
        """
        return self.getId(name)

    def defineId(self, name, id):
        """
        Give a specific id for a certain name. Neither the name nor the id may already exist
        in the set, the set must not be locked, and the id must be smaller than nextFreeId.
        Usually this method is used only when inserting name/id pairs from an existing source.

        @param name: the name to define
        @type name: str
        @param id: the identifier number to give to the name
        @type id: int
        @raise AssertionError: if any of the above conditions does not hold
        """
        assert not self.locked
        # _namesById mirrors Ids, so this is the same test as scanning Ids.values(),
        # without the linear search.
        assert id not in self._namesById
        assert name not in self.Ids
        assert id < self.nextFreeId
        self.Ids[name] = id
        self._namesById[id] = name

    def getName(self, id):
        """
        Returns the name corresponding to the identifier. If the identifier doesn't exist,
        returns None.

        @param id: the identifier number
        @type id: int
        @rtype: str or None
        @return: a name
        """
        return self._namesById.get(id)

    def getNames(self):
        """
        Returns a sorted list of all names. Can be slow for large IdSets.
        """
        return sorted(self.Ids.keys())

    def getIds(self):
        """
        Returns a sorted list of id numbers. Can be slow for large IdSets.
        """
        return sorted(self.Ids.values())

    def write(self, filename):
        """
        Writes the name/id pairs to a file, one pair per line, in the format "name: id".
        The pairs are sorted by name and encoded as UTF-8. If the filename ends with ".gz"
        the file is gzip-compressed.

        @param filename: path of the file to write
        @type filename: str
        """
        # Both branches open the file in binary mode and let codecs do the encoding, so
        # the same code works whether the underlying stream expects bytes (Python 3) or
        # plain strings (Python 2).
        if filename.endswith(".gz"):
            handle = gzip.open(filename, "wb")
            writer = codecs.getwriter("utf-8")(handle)
        else:
            handle = writer = codecs.open(filename, "w", "utf-8")
        try:
            for key in sorted(self.Ids.keys()):
                # key is assumed to be a string
                writer.write(key + ": " + str(self.Ids[key]) + "\n")
        finally:
            handle.close()

    def load(self, filename):
        """
        Loads name/id pairs from a file written by write(). The IdSet is cleared of all
        existing ids and filled with the ones from the file; nextFreeId becomes one more
        than the largest id loaded. Blank lines are ignored. If the file cannot be parsed
        the IdSet is left unchanged.

        @param filename: path of the file to read (gzip-compressed if it ends with ".gz")
        @type filename: str
        @raise ValueError: if a non-blank line is not in the "name: id" format
        """
        if filename.endswith(".gz"):
            handle = gzip.open(filename, "rb")
            reader = codecs.getreader("utf-8")(handle)
        else:
            handle = reader = codecs.open(filename, "r", "utf-8")
        try:
            lines = reader.readlines()
        finally:
            handle.close()

        # Parse into fresh dictionaries first so a malformed line cannot leave the
        # object half-loaded.
        ids = {}
        names = {}
        for line in lines:
            if not line.strip():
                continue
            # Split on the last colon: names may themselves contain colons.
            key, value = line.rsplit(":", 1)
            key = key.strip()
            value = int(value.strip())
            ids[key] = value
            names[value] = key

        self.Ids = ids
        self._namesById = names
        if names:
            self.nextFreeId = max(names) + 1
        else:
            # Nothing was loaded: number new names as an empty set would.
            self.nextFreeId = getattr(self, "_firstNumber", 1)
171