1 import sys, os
2 try:
3 import xml.etree.cElementTree as ET
4 except ImportError:
5 import cElementTree as ET
6 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
7 import Utils.ElementTreeUtils as ETUtils
8 import Utils.Range as Range
9 from collections import defaultdict
10 import ExampleBuilders.PhraseTriggerExampleBuilder
11
def tokenize(text):
    """Split text into maximal runs of whitespace / non-whitespace characters.

    The runs alternate and concatenate back to exactly the input text, so
    character offsets can be recovered by summing token lengths (this is what
    extend() relies on).

    @param text: the sentence text to split
    @return: list of tokens; empty list for empty input

    Fixes vs. the original: an empty input used to yield [""] (one bogus
    empty token); it now yields []. Non-empty inputs behave identically.
    """
    tokens = []
    for c in text:
        # Extend the current run while the "spaceness" of the character
        # matches the run's; otherwise start a new run.
        if tokens and c.isspace() == tokens[-1][-1].isspace():
            tokens[-1] += c
        else:
            tokens.append(c)
    return tokens
32
# Generic words that commonly accompany (or appear inside) a bacterium name
# mention; compared after stripping one trailing period and optional lowering.
_EXTRA_WORDS = frozenset([
    "heliothrix", "caldicellulosiruptor",
    "genus", "bacterium", "bacteria", "strain", "organisms",
    "fetus", "venerealis", "subsp", "subspecies", "ssp",
    "-like", "sp", "serotope", "psjn",
])

def isExtraWord(token, toLower=True):
    """Return True if token is a generic 'extra' word of a bacterium mention.

    @param token: candidate token (a trailing period is ignored)
    @param toLower: if True, compare case-insensitively
    @return: bool

    Fixes vs. the original: an empty token used to raise IndexError on
    token[-1]; it now returns False. The long if/elif chain is replaced by
    a single frozenset lookup with identical membership.
    """
    if token and token[-1] == ".":
        token = token[:-1]
    if toLower:
        token = token.lower()
    return token in _EXTRA_WORDS

# Latinate / taxonomic name endings. Endings with English-word exceptions
# ("us", "um", "ans", "is") are handled separately below.
_NAME_SUFFIXES = ("lla", "ica", "bacter", "ma", "ia", "ii", "li",
                  "nii", "plasma", "plasmas", "ae", "ri", "ni")

def isBacteriaToken(token, bacteriaTokens, relPos):
    """Heuristically decide whether token can belong to a bacterium name.

    @param token: candidate token (no internal whitespace)
    @param bacteriaTokens: collection of known lower-case bacteria name tokens
        (presumably from getBacteriaTokens() — membership-tested only)
    @param relPos: position relative to the entity head token
        (negative = before the head, 0 = the head, positive = after)
    @return: bool
    """
    # Strip leading punctuation such as quotes or opening parentheses.
    while len(token) > 0 and not token[0].isalnum():
        token = token[1:]
    # Trailing closing parentheses are stripped only after the head token.
    if relPos > 0:
        while len(token) > 0 and token[-1] == ")":
            token = token[:-1]

    # Abbreviated genus: "E." or four-letter forms like "Ent."
    if len(token) == 2 and token[0].isupper() and token[1] == ".":
        return True
    if len(token) == 4 and token[0].isupper() and token[-1] == "." and token[1:3].islower():
        return True

    if len(token) == 0:
        return False
    if token[-1] == ".":
        token = token[:-1]
        if len(token) == 0:
            return False
    if token[-1] == ",":
        # A comma-terminated token never extends a name.
        # NOTE(review): the original (formatting lost) contained an
        # unreachable relPos-dependent comma-stripping branch after this
        # return; it has been removed as dead code — confirm upstream.
        return False

    tokenLower = token.lower()
    # Known name tokens, also when joined by "-" or "/".
    if tokenLower in bacteriaTokens:
        return True
    for part in tokenLower.split("-"):
        if part in bacteriaTokens:
            return True
    for part in tokenLower.split("/"):
        if part in bacteriaTokens:
            return True

    # Special-cased strain label (also covered by the all-caps rule below).
    if token == "JIP":
        return True

    # Taxonomic suffixes; a few common English words are excluded.
    if tokenLower.endswith(_NAME_SUFFIXES):
        return True
    if tokenLower.endswith("us") and tokenLower != "thus":
        return True
    if tokenLower.endswith("um") and tokenLower != "phylum":
        return True
    if tokenLower.endswith("ans") and tokenLower != "humans":
        return True
    if tokenLower.endswith("is") and tokenLower not in ("is", "this"):
        return True

    if isExtraWord(token, toLower=True):
        return True

    # Strain designations made only of digits, hyphens and capitals ("K-12").
    if all(c.isdigit() or c == "-" or c.isupper() for c in token):
        return True

    return False
166
167 -def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
168 if not (ET.iselement(input) and input.tag == "sentence"):
169 print >> sys.stderr, "Loading corpus file", input
170 corpusTree = ETUtils.ETFromObj(input)
171 corpusRoot = corpusTree.getroot()
172
173 bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
174
175 if not (ET.iselement(input) and input.tag == "sentence"):
176 sentences = corpusRoot.getiterator("sentence")
177 else:
178 sentences = [input]
179 counts = defaultdict(int)
180 for sentence in sentences:
181 incorrectCount = 0
182 sentenceText = sentence.get("text")
183 tokens = tokenize(sentenceText)
184 for entity in sentence.findall("entity"):
185 counts["all-entities"] += 1
186 if entity.get("type") not in entityTypes:
187 continue
188 headOffset = entity.get("headOffset")
189 if headOffset == None:
190 if verbose: print "WARNING, no head offset for entity", entity.get("id")
191 headOffset = entity.get("charOffset")
192 headOffset = Range.charOffsetToTuples(headOffset)[0]
193 charOffset = entity.get("charOffset")
194 assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
195 charOffset = Range.charOffsetToTuples(charOffset)[0]
196 tokPos = [0,0]
197 tokIndex = None
198
199 for i in range(len(tokens)):
200 token = tokens[i]
201 tokPos[1] = tokPos[0] + len(token)
202 if Range.overlap(headOffset, tokPos):
203 tokIndex = i
204 break
205 tokPos[0] += len(token)
206 assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
207 skip = False
208 if tokPos[0] < headOffset[0]:
209 tokPos = headOffset
210 skip = True
211 if not skip:
212
213 beginIndex = tokIndex
214 for i in range(tokIndex-1, -1, -1):
215 token = tokens[i]
216 if token.isspace():
217 continue
218 if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
219 beginIndex = i + 1
220 break
221 if i == 0:
222 beginIndex = i
223 while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
224 beginIndex += 1
225 if beginIndex >= tokIndex:
226 beginIndex = tokIndex
227 break
228
229 endIndex = tokIndex
230 if tokens[tokIndex][-1] != ",":
231 endIndex = tokIndex
232 for i in range(tokIndex+1, len(tokens)):
233 token = tokens[i]
234 if token.isspace():
235 continue
236 if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
237 endIndex = i - 1
238 break
239 if i == len(tokens) - 1:
240 endIndex = i
241 while tokens[endIndex].isspace():
242 endIndex -= 1
243
244 if tokIndex > beginIndex:
245 for token in reversed(tokens[beginIndex:tokIndex]):
246 tokPos[0] -= len(token)
247 if tokIndex < endIndex:
248 for token in tokens[tokIndex+1:endIndex+1]:
249 tokPos[1] += len(token)
250
251 while not sentenceText[tokPos[1] - 1].isalnum():
252 tokPos[1] -= 1
253 if tokPos[1] < tokPos[0] + 1:
254 tokPos[1] = tokPos[0] + 1
255 break
256 while not sentenceText[tokPos[0]].isalnum():
257 tokPos[0] += 1
258 if tokPos[0] >= tokPos[1]:
259 tokPos[0] = tokPos[1] - 1
260 break
261
262
263
264
265
266
267 counts["entities"] += 1
268 newOffset = tuple(tokPos)
269 newOffsetString = Range.tuplesToCharOffset([newOffset])
270 if verbose:
271 print "Entity", entity.get("id"),
272
273 print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
274 print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
275 if newOffset != headOffset:
276 counts["extended"] += 1
277 if verbose: print "EXTENDED",
278 if newOffset == charOffset:
279 counts["correct"] += 1
280 if verbose: print "CORRECT"
281 else:
282 counts["incorrect"] += 1
283 incorrectCount += 1
284 if verbose: print "INCORRECT"
285 entity.set("charOffset", newOffsetString)
286
287 entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
288 if incorrectCount > 0 and verbose:
289 print "TOKENS:", "|".join(tokens)
290 print "--------------------------------"
291 if verbose:
292 print counts
293
294 if not (ET.iselement(input) and input.tag == "sentence"):
295 if output != None:
296 print >> sys.stderr, "Writing output to", output
297 ETUtils.write(corpusRoot, output)
298 return corpusTree
299
if __name__=="__main__":
    print >> sys.stderr, "##### Extend Triggers #####"

    # Optional Python 2 JIT; silently degrade when unavailable.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    # Fixed: usage text was copy-pasted from an html-visualization tool, and
    # the -o help text duplicated -i's.
    optparser = OptionParser(usage="%prog [options]\nExtend bacteria trigger entity spans in a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output corpus file", metavar="FILE")
    optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Verbose output")
    (options, args) = optparser.parse_args()
    # optparser.error instead of assert: asserts vanish under "python -O".
    if options.input is None:
        optparser.error("Input file required (-i)")

    extend(options.input, options.output, verbose=options.debug)
320