1 import sys, os
2 try:
3 import xml.etree.cElementTree as ET
4 except ImportError:
5 import cElementTree as ET
6 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
7 import Utils.ElementTreeUtils as ETUtils
8 import Utils.Range as Range
9 from collections import defaultdict
10 import ExampleBuilders.PhraseTriggerExampleBuilder
11
def tokenize(text):
    """Split text into maximal runs of whitespace / non-whitespace characters.

    The runs alternate and concatenate back to exactly the input text, so
    character offsets can be recovered by summing token lengths (this is what
    extend() relies on).

    @param text: the sentence text to split
    @return: list of tokens; empty list for empty input

    Fixes vs. the original: an empty input used to yield [""] (one bogus
    empty token); it now yields []. Non-empty inputs behave identically.
    """
    tokens = []
    for c in text:
        # Extend the current run while the "spaceness" of the character
        # matches the run's; otherwise start a new run.
        if tokens and c.isspace() == tokens[-1][-1].isspace():
            tokens[-1] += c
        else:
            tokens.append(c)
    return tokens
32
# Generic words that commonly accompany (or appear inside) a bacterium name
# mention; compared after stripping one trailing period and optional lowering.
_EXTRA_WORDS = frozenset([
    "heliothrix", "caldicellulosiruptor",
    "genus", "bacterium", "bacteria", "strain", "organisms",
    "fetus", "venerealis", "subsp", "subspecies", "ssp",
    "-like", "sp", "serotope", "psjn",
])

def isExtraWord(token, toLower=True):
    """Return True if token is a generic 'extra' word of a bacterium mention.

    @param token: candidate token (a trailing period is ignored)
    @param toLower: if True, compare case-insensitively
    @return: bool

    Fixes vs. the original: an empty token used to raise IndexError on
    token[-1]; it now returns False. The long if/elif chain is replaced by
    a single frozenset lookup with identical membership.
    """
    if token and token[-1] == ".":
        token = token[:-1]
    if toLower:
        token = token.lower()
    return token in _EXTRA_WORDS

# Latinate / taxonomic name endings. Endings with English-word exceptions
# ("us", "um", "ans", "is") are handled separately below.
_NAME_SUFFIXES = ("lla", "ica", "bacter", "ma", "ia", "ii", "li",
                  "nii", "plasma", "plasmas", "ae", "ri", "ni")

def isBacteriaToken(token, bacteriaTokens, relPos):
    """Heuristically decide whether token can belong to a bacterium name.

    @param token: candidate token (no internal whitespace)
    @param bacteriaTokens: collection of known lower-case bacteria name tokens
        (presumably from getBacteriaTokens() — membership-tested only)
    @param relPos: position relative to the entity head token
        (negative = before the head, 0 = the head, positive = after)
    @return: bool
    """
    # Strip leading punctuation such as quotes or opening parentheses.
    while len(token) > 0 and not token[0].isalnum():
        token = token[1:]
    # Trailing closing parentheses are stripped only after the head token.
    if relPos > 0:
        while len(token) > 0 and token[-1] == ")":
            token = token[:-1]

    # Abbreviated genus: "E." or four-letter forms like "Ent."
    if len(token) == 2 and token[0].isupper() and token[1] == ".":
        return True
    if len(token) == 4 and token[0].isupper() and token[-1] == "." and token[1:3].islower():
        return True

    if len(token) == 0:
        return False
    if token[-1] == ".":
        token = token[:-1]
        if len(token) == 0:
            return False
    if token[-1] == ",":
        # A comma-terminated token never extends a name.
        # NOTE(review): the original (formatting lost) contained an
        # unreachable relPos-dependent comma-stripping branch after this
        # return; it has been removed as dead code — confirm upstream.
        return False

    tokenLower = token.lower()
    # Known name tokens, also when joined by "-" or "/".
    if tokenLower in bacteriaTokens:
        return True
    for part in tokenLower.split("-"):
        if part in bacteriaTokens:
            return True
    for part in tokenLower.split("/"):
        if part in bacteriaTokens:
            return True

    # Special-cased strain label (also covered by the all-caps rule below).
    if token == "JIP":
        return True

    # Taxonomic suffixes; a few common English words are excluded.
    if tokenLower.endswith(_NAME_SUFFIXES):
        return True
    if tokenLower.endswith("us") and tokenLower != "thus":
        return True
    if tokenLower.endswith("um") and tokenLower != "phylum":
        return True
    if tokenLower.endswith("ans") and tokenLower != "humans":
        return True
    if tokenLower.endswith("is") and tokenLower not in ("is", "this"):
        return True

    if isExtraWord(token, toLower=True):
        return True

    # Strain designations made only of digits, hyphens and capitals ("K-12").
    if all(c.isdigit() or c == "-" or c.isupper() for c in token):
        return True

    return False
166
167 -def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
168 if not (ET.iselement(input) and input.tag == "sentence"):
169 print >> sys.stderr, "Loading corpus file", input
170 corpusTree = ETUtils.ETFromObj(input)
171 corpusRoot = corpusTree.getroot()
172
173 bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
174
175 if not (ET.iselement(input) and input.tag == "sentence"):
176 sentences = corpusRoot.getiterator("sentence")
177 else:
178 sentences = [input]
179 counts = defaultdict(int)
180 for sentence in sentences:
181 incorrectCount = 0
182 sentenceText = sentence.get("text")
183 tokens = tokenize(sentenceText)
184 for entity in sentence.findall("entity"):
185 counts["all-entities"] += 1
186 if entity.get("type") not in entityTypes:
187 continue
188 headOffset = entity.get("headOffset")
189 if headOffset == None:
190 if verbose: print "WARNING, no head offset for entity", entity.get("id")
191 headOffset = entity.get("charOffset")
192 headOffset = Range.charOffsetToTuples(headOffset)[0]
193 charOffset = entity.get("charOffset")
194 assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
195 charOffset = Range.charOffsetToTuples(charOffset)[0]
196 tokPos = [0,0]
197 tokIndex = None
198
199 for i in range(len(tokens)):
200 token = tokens[i]
201 tokPos[1] = tokPos[0] + len(token)
202 if Range.overlap(headOffset, tokPos):
203 tokIndex = i
204 break
205 tokPos[0] += len(token)
206 assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
207 skip = False
208 if tokPos[0] < headOffset[0]:
209 tokPos = headOffset
210 skip = True
211 if not skip:
212
213 beginIndex = tokIndex
214 for i in range(tokIndex-1, -1, -1):
215 token = tokens[i]
216 if token.isspace():
217 continue
218 if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
219 beginIndex = i + 1
220 break
221 if i == 0:
222 beginIndex = i
223 while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
224 beginIndex += 1
225 if beginIndex >= tokIndex:
226 beginIndex = tokIndex
227 break
228
229 endIndex = tokIndex
230 if tokens[tokIndex][-1] != ",":
231 endIndex = tokIndex
232 for i in range(tokIndex+1, len(tokens)):
233 token = tokens[i]
234 if token.isspace():
235 continue
236 if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
237 endIndex = i - 1
238 break
239 if i == len(tokens) - 1:
240 endIndex = i
241 while tokens[endIndex].isspace():
242 endIndex -= 1
243
244 if tokIndex > beginIndex:
245 for token in reversed(tokens[beginIndex:tokIndex]):
246 tokPos[0] -= len(token)
247 if tokIndex < endIndex:
248 for token in tokens[tokIndex+1:endIndex+1]:
249 tokPos[1] += len(token)
250
251 while not sentenceText[tokPos[1] - 1].isalnum():
252 tokPos[1] -= 1
253 if tokPos[1] < tokPos[0] + 1:
254 tokPos[1] = tokPos[0] + 1
255 break
256 while not sentenceText[tokPos[0]].isalnum():
257 tokPos[0] += 1
258 if tokPos[0] >= tokPos[1]:
259 tokPos[0] = tokPos[1] - 1
260 break
261
262
263
264
265
266
267 counts["entities"] += 1
268 newOffset = tuple(tokPos)
269 newOffsetString = Range.tuplesToCharOffset([newOffset])
270 if verbose:
271 print "Entity", entity.get("id"),
272
273 print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
274 print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
275 if newOffset != headOffset:
276 counts["extended"] += 1
277 if verbose: print "EXTENDED",
278 if newOffset == charOffset:
279 counts["correct"] += 1
280 if verbose: print "CORRECT"
281 else:
282 counts["incorrect"] += 1
283 incorrectCount += 1
284 if verbose: print "INCORRECT"
285 entity.set("charOffset", newOffsetString)
286
287 entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
288 if incorrectCount > 0 and verbose:
289 print "TOKENS:", "|".join(tokens)
290 print "--------------------------------"
291 if verbose:
292 print counts
293
294 if not (ET.iselement(input) and input.tag == "sentence"):
295 if output != None:
296 print >> sys.stderr, "Writing output to", output
297 ETUtils.write(corpusRoot, output)
298 return corpusTree
299
if __name__=="__main__":
    print >> sys.stderr, "##### Extend Triggers #####"

    # Optional Python 2 JIT; silently degrade when unavailable.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    # Fixed: usage text was copy-pasted from an html-visualization tool, and
    # the -o help text duplicated -i's.
    optparser = OptionParser(usage="%prog [options]\nExtend bacteria trigger entity spans in a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output corpus file", metavar="FILE")
    optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Verbose output")
    (options, args) = optparser.parse_args()
    # optparser.error instead of assert: asserts vanish under "python -O".
    if options.input is None:
        optparser.error("Input file required (-i)")

    extend(options.input, options.output, verbose=options.debug)
320