
Source Code for Module TEES.Utils.DetectHeads

import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
import Core.SentenceGraph as SentenceGraph
from Utils.ProgressCounter import ProgressCounter
from FindHeads import findHeads # note: shadowed by the findHeads defined later in this module
import Utils.ElementTreeUtils as ETUtils
import Utils.InteractionXML.CorpusElements
import Utils.Range as Range
import Utils.Libraries.PorterStemmer as PorterStemmer

def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"entity text"->"count"
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        if not trigDict.has_key(eType):
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if not trigDict[eType].has_key(eText):
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict

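# A minimal sketch of the structure getTriggers returns, for a hypothetical
# corpus where the stem "activ" ("activation", "activates") occurs twice as a
# Positive_regulation trigger and once as a Regulation trigger:
#
#   {"Positive_regulation": {"activ": 2},
#    "Regulation": {"activ": 1}}
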
def getDistribution(trigDict):
    """
    Converts a dictionary of "entity type"->"entity text"->"count"
    to "entity text"->"entity type"->"(count, fraction)"
    """
    distDict = {}
    eTypes = trigDict.keys()
    for eType in trigDict.keys():
        for string in trigDict[eType].keys():
            if not distDict.has_key(string):
                distDict[string] = {}
                for e in eTypes:
                    distDict[string][e] = [0, None]
            distDict[string][eType] = [trigDict[eType][string], None]
    # define ratios
    for string in distDict.keys():
        count = 0.0
        for eType in distDict[string].keys():
            count += distDict[string][eType][0]
        for eType in distDict[string].keys():
            distDict[string][eType][1] = distDict[string][eType][0] / count
    return distDict

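# Continuing the hypothetical example above, getDistribution inverts the mapping
# and adds per-type fractions, so each string maps to a [count, fraction] pair
# per entity type:
#
#   {"activ": {"Positive_regulation": [2, 0.667],
#              "Regulation": [1, 0.333]}}
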
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        # The original code used an undefined "tokens" name here; tokens are
        # assumed to be collected from the sentence's tokenization elements.
        tokens = sentence.getiterator("token")
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText): headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # The indices are entity-relative, so the head text is sliced from
                # the entity text (the original referenced an undefined variable)
                headText = eText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if not headDict[eType].has_key(headText): headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText): headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict

def getOverlap():
    pass

def removeHeads(corpus):
    print >> sys.stderr, "Removing existing head offsets"
    removeCount = 0
    xml = ETUtils.ETFromObj(corpus)
    for d in xml.getroot().findall("document"):
        for s in d.findall("sentence"):
            for e in s.findall("entity"):
                if e.get("headOffset") != None:
                    removeCount += 1
                    del e.attrib["headOffset"]
    print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    return [0, removeCount] # counts convention: [heads added, heads removed]

def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    for m in methods:
        assert m in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        print >> sys.stderr, method, "pass"
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse, tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        print >> sys.stderr, method, "pass added", counts[method][0], "and removed", counts[method][1], "heads"

    print >> sys.stderr, "Summary (pass/added/removed):"
    for method in methods:
        print >> sys.stderr, " ", method, "/", counts[method][0], "/", counts[method][1]

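# Typical usage, mirroring the __main__ block below: run all three passes in
# order, so existing offsets are removed first, the dictionary pass handles
# multi-part trigger strings, and the syntactic pass fills in the rest:
#
#   findHeads(corpus, dictionaryCorpus, ["REMOVE", "DICT", "SYNTAX"],
#             "split-McClosky", "split-McClosky")
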
def mapSplits(splits, string, stringOffset):
    """
    Maps each split substring back to its offset in the original string and stems it.
    Returns a list of (text, stem, (offset, length)) tuples. The stringOffset
    parameter is currently unused.
    """
    begin = 0
    tuples = []
    for split in splits:
        offset = string.find(split, begin)
        assert offset != -1
        tuples.append( (split, PorterStemmer.stem(split), (offset, len(split))) )
        begin = offset + len(split)
    return tuples

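# A sketch of mapSplits on assumed input: splitting "protein kinase" by
# whitespace yields (text, stem, (offset, length)) tuples relative to the
# original string (the stems shown are approximate):
#
#   mapSplits(["protein", "kinase"], "protein kinase", (0, 13))
#   -> [("protein", "protein", (0, 7)), ("kinase", "kinas", (8, 6))]
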
def findHeadsDictionary(corpus, stringsFrom, parse, tokenization):
    print "Extracting triggers from", stringsFrom
    trigDict = getTriggers(stringsFrom)
    print "Determining trigger distribution"
    distDict = getDistribution(trigDict)
    allStrings = sorted(distDict.keys())
    print "Determining heads for", corpus
    corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(corpus, parse, tokenization, removeIntersentenceInteractions=False, removeNameInfo=False)
    cases = {}
    counts = [0,0]
    for sentence in corpusElements.sentences:
        sText = sentence.sentence.get("text")
        for entity in sentence.entities:
            if entity.get("headOffset") != None:
                continue
            if entity.get("isName") == "True": # only for triggers
                continue
            eText = entity.get("text")
            eType = entity.get("type")
            eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            wsSplits = eText.split() # split by whitespace
            if len(wsSplits) == 1 and eText.find("-") == -1: # an unambiguous head will be assigned by the SYNTAX pass
                continue
            else: # entity text has multiple (whitespace- or hyphen-separated) parts
                candidates = []
                # Try to find each whitespace-separated substring in the trigger distribution
                for wsTuple in mapSplits(wsSplits, eText, eOffset):
                    if not distDict.has_key(wsTuple[1]): # string not found, low score
                        candidates.append( ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]) )
                    else: # string found, more common ones get a higher score
                        assert distDict[wsTuple[1]].has_key(eType), (distDict[wsTuple[1]], wsTuple[0], eText)
                        candidates.append( (tuple(distDict[wsTuple[1]][eType]), wsTuple[2], wsTuple[0], wsTuple[1]) )
                # Split each whitespace-separated string further into hyphen-separated substrings
                for candidate in candidates[:]:
                    hyphenSplits = candidate[2].split("-")
                    if len(hyphenSplits) > 1: # substring has a hyphen
                        for hyphenTuple in mapSplits(hyphenSplits, eText, candidate[1]):
                            if not distDict.has_key(hyphenTuple[1]):
                                candidates.append( ((-1, -1), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
                            else:
                                candidates.append( (tuple(distDict[hyphenTuple[1]][eType]), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
                # Sort candidates, highest scores come first
                candidates.sort(reverse=True)
                # If no matches, look for known trigger strings inside the substrings
                if candidates[0][0][0] in [-1, 0]:
                    print "Substring matching", candidates, "for entity", entity.get("id")
                    for i in range(len(candidates)):
                        candidate = candidates[i]
                        cText = candidate[2]
                        for string in allStrings:
                            subStringPos = cText.find(string)
                            if subStringPos != -1:
                                print "  Substring match", string, cText,
                                score = tuple(distDict[string][eType])
                                if score > candidate[0]:
                                    print score, candidate[0], "Substring selected"
                                    subStringCoords = [candidate[1][0] + subStringPos, len(string)]
                                    candidate = (score, subStringCoords, candidate[2], ">"+string+"<")
                                else:
                                    print score, candidate[0]
                        candidates[i] = candidate
                    # Re-sort after possibly replacing some candidates
                    candidates.sort(reverse=True)
                if candidates[0][0][0] not in [-1, 0]: # if it is in [-1, 0], let the SYNTAX pass take care of it
                    candidateOffset = (candidates[0][1][0] + eOffset[0], candidates[0][1][0] + candidates[0][1][1] + eOffset[0])
                    entity.set("headOffset", str(candidateOffset[0]) + "-" + str(candidateOffset[1]-1))
                    entity.set("headMethod", "Dict")
                    entity.set("headString", sText[candidateOffset[0]:candidateOffset[1]])
                    counts[0] += 1
                # Prepare results for printing
                for i in range(len(candidates)):
                    c = candidates[i]
                    candidates[i] = (tuple(c[0]), c[2], c[3])
                case = (eType, eText, tuple(candidates))
                if not cases.has_key(case):
                    cases[case] = 0
                cases[case] += 1
                print entity.get("id"), eType + ": '" + eText + "'", candidates
    print "Cases"
    for case in sorted(cases.keys()):
        print case, cases[case]
    return counts

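# A hedged walk-through of the DICT pass on an assumed entity: for the trigger
# text "NF-kappaB activation", the whitespace split yields the candidates
# "NF-kappaB" and "activation", and "NF-kappaB" is further split at the hyphen
# into "NF" and "kappaB". Each stemmed substring is scored with its
# (count, fraction) pair from distDict, unknown strings get (-1, -1), and the
# best-scoring substring (here presumably the stem of "activation") becomes the
# head, recorded in the headOffset, headMethod and headString attributes.
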
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for each entity (named entity or trigger) in the corpus.
    The head token is the token closest to the root for the subtree of the dependency
    parse spanned by the text of the entity element.

    @param corpus: an interaction XML corpus as an ElementTree
    @param parse: the name of the parse element to use
    @param tokenization: the name of the tokenization element to use
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue # the original fell through here, which would fail on the None elements below
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts

def getEntityHeadToken(entity, tokens, tokenHeadScores):
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1: # an unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # one head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    assert selectedHeadToken != None, entity.get("id") # the original referenced an undefined "entityElement"
    return selectedHeadToken

def findHeadToken(candidateTokens, tokenHeadScores):
    """
    Select the candidate token that is closest to the root of the subtree of the
    dependency parse to which the candidate tokens belong. See the getTokenHeadScores
    method for the algorithm.

    @param candidateTokens: the list of syntactic tokens from which the head token is selected
    @type candidateTokens: list of cElementTree.Element objects
    """
    if len(candidateTokens) == 0:
        return None

    highestScore = -9999999
    bestTokens = []
    for token in candidateTokens:
        if tokenHeadScores[token] > highestScore:
            highestScore = tokenHeadScores[token]
    for token in candidateTokens:
        if tokenHeadScores[token] == highestScore:
            bestTokens.append(token)
    return bestTokens[-1]

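# Note: when several candidates share the highest score, the last one in token
# order is returned, which biases ties toward the rightmost token; in English
# noun phrases that is usually the syntactic head.
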
def getTokenHeadScores(tokens, dependencies, sentenceId=None):
    """
    A head token is chosen using a heuristic that prefers tokens closer to the
    root of the dependency parse. In a list of candidate tokens, the one with
    the highest score is the head token. The return value of this method
    is a dictionary that maps token elements to their scores.
    """
    tokenHeadScores = {}

    # Give all tokens initial scores
    for token in tokens:
        tokenHeadScores[token] = 0 # initialize score as zero (unconnected token)
        for dependency in dependencies:
            if dependency.get("t1") == token.get("id") or dependency.get("t2") == token.get("id"):
                tokenHeadScores[token] = 1 # token is connected by a dependency
                break

    # Give a low score to tokens that clearly can't be the head and are
    # probably produced by the hyphen-splitter
    for token in tokens:
        tokenText = token.get("text")
        if tokenText == "\\" or tokenText == "/" or tokenText == "-":
            tokenHeadScores[token] = -1

    # Loop over all dependencies and increase the scores of all governor tokens
    # until each governor token has a higher score than its dependent token.
    # Some dependencies might form a loop, so a list is used to define those
    # dependency types used in determining head scores.
    depTypesToInclude = ["prep", "nn", "det", "hyphen", "num", "amod", "nmod", "appos", "measure", "dep", "partmod"]
    modifiedScores = True
    loopCount = 0 # loop count for the devel set is approx. 2-4
    while modifiedScores == True: # loop until the scores no longer change
        if loopCount > 20: # survive loops
            print >> sys.stderr, "Warning, possible loop in parse for sentence", sentenceId
            break
        modifiedScores = False
        for token1 in tokens:
            for token2 in tokens: # for each combination of tokens...
                for dep in dependencies: # ... check each dependency
                    if dep.get("t1") == token1.get("id") and dep.get("t2") == token2.get("id") and (dep.get("type") in depTypesToInclude):
                        # The governor token of the dependency must have a higher score
                        # than the dependent token.
                        if tokenHeadScores[token1] <= tokenHeadScores[token2]:
                            tokenHeadScores[token1] = tokenHeadScores[token2] + 1
                            modifiedScores = True
        loopCount += 1
    return tokenHeadScores

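# A minimal sketch of the score propagation on assumed input: for tokens
# "protein" and "kinase" joined by a single nn dependency whose governor (t1)
# is "kinase", both tokens start at score 1 (connected); the loop then lifts
# the governor above its dependent, giving {"protein": 1, "kinase": 2}, so
# "kinase" is chosen as the head. Scores keep propagating up chains of the
# listed dependency types until stable, or until the 20-iteration guard trips
# on a cyclic parse.
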
if __name__=="__main__":
    print >> sys.stderr, "##### Calculating entity head token offsets #####"

    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using it"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nRecalculate head token offsets.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-d", "--dictionary", default=None, dest="dictionary", help="Corpus file to use as dictionary of entity strings.")
    optparser.add_option("-m", "--methods", default=None, dest="methods", help="")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name for calculating head offsets")
    optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name for calculating head offsets")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Loading corpus"
    corpus = ETUtils.ETFromObj(options.input)
    print >> sys.stderr, "Finding heads"
    findHeads(corpus, options.dictionary, ["REMOVE", "DICT", "SYNTAX"], options.parse, options.tokenization)
    if options.output != None:
        print >> sys.stderr, "Writing corpus"
        ETUtils.write(corpus, options.output)