1 """
2 Manages classification class and feature ids.
3 """
4 __version__ = "$Revision: 1.20 $"
5
6 import codecs
7 import gzip
8
class IdSet:
    """
    A mapping from strings to id integers. This class is used for defining the ids for classes
    and features of machine learning systems.

    The name -> id mapping is kept in self.Ids and the reverse id -> name mapping in
    self._namesById. self.nextFreeId is the number that getId gives to the next new name.
    """

    def __init__(self, firstNumber=1, idDict=None, locked=False, filename=None, allowNewIds=True):
        """
        Creates a new IdSet or loads one from a dictionary or a file.

        To create a new, empty set: idset = IdSet(firstNumber = x).
        To create a set from a str->int dictionary: idset = IdSet(idDict = x).
        To load a dictionary from a file: idset = IdSet(filename = x).

        @param firstNumber: The number given to the first name defined. Subsequent names will
        have higher numbers.
        @type firstNumber: int
        @param idDict: Dictionary of name / integer pairs. The integer values must be unique,
        otherwise an AssertionError is raised. An empty dictionary gives an empty set.
        @type idDict: dictionary
        @param locked: Whether new names can be added to the set. If set to True, getId will
        return None for names that are not already in the set.
        @type locked: boolean
        @param filename: load name/id pairs from a file. Loading replaces anything defined
        through idDict.
        @type filename: str
        @param allowNewIds: The default for the createIfNotExist argument of getId.
        @type allowNewIds: boolean
        """
        self.Ids = {}
        self._namesById = {}
        # Remembered so that load() can fall back to it when the file has no pairs.
        self._firstNumber = firstNumber
        self.nextFreeId = firstNumber
        self.allowNewIds = allowNewIds

        if idDict is not None:
            for name, id in idDict.items():
                self._register(name, id)
            # Continue numbering after the largest predefined id. An empty
            # dictionary leaves the numbering at firstNumber.
            if self._namesById:
                self.nextFreeId = max(self._namesById) + 1
        self.locked = locked

        if filename is not None:
            self.load(filename)

    def _register(self, name, id):
        """
        Stores a name/id pair in both mappings after checking that neither is already in use.

        AssertionError is raised (instead of using an assert statement) so that the check
        also runs under "python -O" while callers that catch AssertionError keep working.
        """
        if name in self.Ids:
            raise AssertionError("name already defined: " + repr(name))
        # The reverse mapping has one key per id, so this is a constant-time
        # replacement for scanning self.Ids.values().
        if id in self._namesById:
            raise AssertionError("id already defined: " + repr(id))
        self.Ids[name] = id
        self._namesById[id] = name

    def getId(self, key, createIfNotExist=None):
        """
        Returns the id number for a name. If the name doesn't already have an id, a new id is defined,
        unless the set is locked or createIfNotExist is False, in which case None is returned.

        @type key: str
        @param key: name
        @type createIfNotExist: True, False or None
        @param createIfNotExist: If the name doesn't have an id, define an id for it. None means
        "use the allowNewIds value given to the constructor".
        @rtype: int or None
        @return: an identifier
        """
        if key in self.Ids:
            return self.Ids[key]
        if createIfNotExist is None:
            createIfNotExist = self.allowNewIds
        # Only an explicit False (or 0) suppresses creation, as before.
        if self.locked or createIfNotExist == False:
            return None
        newId = self.nextFreeId
        self.nextFreeId += 1
        self.Ids[key] = newId
        self._namesById[newId] = key
        return newId

    def __getitem__(self, name):
        """
        Calls getId through the []-operator.
        """
        return self.getId(name)

    def defineId(self, name, id):
        """
        Give a specific id for a certain name. Neither the name nor the id may already exist in
        the set, the set must not be locked, and the id must be smaller than nextFreeId so that
        it can never collide with an automatically assigned id. Usually this method
        is used only when inserting name/id pairs from an existing source.

        Raises AssertionError if any of these conditions is violated.
        """
        if self.locked:
            raise AssertionError("cannot define ids in a locked IdSet")
        if not id < self.nextFreeId:
            raise AssertionError("id must be smaller than nextFreeId: " + repr(id))
        self._register(name, id)

    def getName(self, id):
        """
        Returns the name corresponding to the identifier. If the identifier doesn't exist, returns None.

        @param id: the identifier number
        @type id: int
        @rtype: str or None
        @return: a name
        """
        return self._namesById.get(id)

    def getNames(self):
        """
        Returns a sorted list of all names. Can be slow for large IdSets.
        """
        return sorted(self.Ids)

    def getIds(self):
        """
        Returns a sorted list of id numbers. Can be slow for large IdSets.
        """
        return sorted(self.Ids.values())

    def write(self, filename):
        """
        Writes the name/id pairs to a file, one pair per line, in the format "name: id".
        The lines are sorted by name and encoded as UTF-8. If the filename ends with ".gz"
        the file is gzip-compressed.
        """
        # The underlying file is opened in binary mode and the codec layer does the
        # text encoding; this works the same way on Python 2 and Python 3.
        if filename.endswith(".gz"):
            f = codecs.getwriter("utf-8")(gzip.open(filename, "wb"))
        else:
            f = codecs.open(filename, "wb", "utf-8")
        try:
            for key in sorted(self.Ids):
                f.write(key + ": " + str(self.Ids[key]) + "\n")
        finally:
            f.close()

    def load(self, filename):
        """
        Loads name/id pairs from a file. The IdSet is cleared of all existing ids before
        loading the ones from the file. The file is expected to be in the format produced
        by write: UTF-8 text, one "name: id" pair per line, gzip-compressed if the filename
        ends with ".gz". Blank lines are ignored.

        After loading, nextFreeId is one larger than the largest id in the file, or the
        firstNumber given to the constructor if the file contains no pairs. The existing
        contents are replaced only after the whole file has been parsed successfully.
        """
        if filename.endswith(".gz"):
            f = codecs.getreader("utf-8")(gzip.open(filename, "rb"))
        else:
            f = codecs.open(filename, "rb", "utf-8")
        try:
            lines = f.readlines()
        finally:
            f.close()

        ids = {}
        namesById = {}
        largest = None
        for line in lines:
            if not line.strip():
                continue
            # Split on the last colon so that names may themselves contain colons.
            key, value = line.rsplit(":", 1)
            key = key.strip()
            value = int(value.strip())
            if largest is None or value > largest:
                largest = value
            ids[key] = value
            namesById[value] = key

        self.Ids = ids
        self._namesById = namesById
        if largest is None:
            self.nextFreeId = getattr(self, "_firstNumber", 1)
        else:
            self.nextFreeId = largest + 1
171