import sys, os
from collections import defaultdict
extraPath = os.path.dirname(os.path.abspath(__file__))+"/../.."
sys.path.append(extraPath)
from Utils.ProgressCounter import ProgressCounter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
import Utils.Range as Range
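
# Tools for mapping annotated entities onto the phrases of a syntactic parse
# in interaction XML. processCorpus() below loads a corpus, builds candidate
# phrases for each sentence, matches each non-name entity to them and prints
# matching statistics.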

def fixIndices(phrases, tokens):
    # Realign each phrase's token-index attributes ("begin"/"end") with the
    # tokens whose character offsets actually delimit the phrase
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break
        if fixed:
            fixCount += 1
        phraseCount += 1

def getPhrases(parse, tokens, filter=None):
    # Collect the parse's phrase elements, optionally keeping only the types
    # in "filter" (the def line was missing; the name getPhrases is an assumption)
    phrases = parse.findall("phrase")
    toKeep = []
    for phrase in phrases:
        if phrase.get("charOffset") == None:
            continue
        if filter != None and phrase.get("type") not in filter:
            continue
        toKeep.append(phrase)
    fixIndices(toKeep, tokens)
    return toKeep

def removeNamedEntityPhrases(entities, phrases, phraseDict):
    # Drop phrases whose span is identical to a named entity's, removing them
    # also from phraseDict (the def line was missing; the name is an assumption)
    neOffsets = set()
    for entity in entities:
        if entity.get("isName") != "True":
            continue
        neOffsets.add(entity.get("charOffset"))
    phrasesToKeep = []
    for phrase in phrases:
        phraseOffset = phrase.get("charOffset")
        if phraseOffset in neOffsets:
            phraseOffsetTuple = Range.charOffsetToSingleTuple(phraseOffset)
            if phraseOffsetTuple in phraseDict:
                del phraseDict[phraseOffsetTuple]
        else:
            phrasesToKeep.append(phrase)
    return phrasesToKeep

def makePhraseDict(phrases):
    # Index phrases by their character offset tuple (the def line was missing;
    # the name is an assumption)
    phraseDict = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if not phraseDict.has_key(phraseOffset):
            phraseDict[phraseOffset] = []
        phraseDict[phraseOffset].append(phrase)
    return phraseDict

def getPhraseTypeCounts(phrases):
    # Count phrases per type (the def line was missing; the name is an assumption)
    counts = {}
    for phrase in phrases:
        pType = phrase.get("type")
        if pType not in counts:
            counts[pType] = 0
        counts[pType] += 1
    return counts

def makePhrase(type, offset, begin, end):
    # Build a new phrase element (def line reconstructed from the call sites)
    e = ET.Element("phrase")
    e.set("type", type)
    e.set("begin", str(begin))
    e.set("end", str(end))
    e.set("charOffset", str(offset[0])+"-"+str(offset[1]))
    return e

def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    # For phrases containing a preposition (POS "IN"), add a sub-phrase that
    # ends just before it (the def line was missing; the name is an assumption)
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin:phraseEnd+1]:
            if token.get("POS") == "IN" and prevToken != None:
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(phrase.get("type") + "-IN",
                          newPhraseOffset,
                          phraseBegin,
                          phraseBegin + tokCount - 1)
                if not phraseDict.has_key(newPhraseOffset):
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases

def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    # If a phrase is preceded by a determiner (POS "DT"), add a sub-phrase
    # that includes it (the def line was missing; the name is an assumption)
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin-1].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(tokens[phraseBegin-1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"),
                      newPhraseOffset,
                      phraseBegin - 1,
                      phraseEnd)
            if not phraseDict.has_key(newPhraseOffset):
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases

def makeTokenSubPhrases(tokens, phraseDict, includePOS=["IN", "WP$", "PRP$"]):
    # Add single-token phrases for selected parts of speech (the def line was
    # missing; the name and the default POS list are assumptions based on the
    # TOK-t* types used in the filter set below)
    newPhrases = []
    for i in range(len(tokens)):
        token = tokens[i]
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if not phraseDict.has_key(tokOffset):
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
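
# The function originally defined here was elided. makePhrases is called in
# processCorpus below as makePhrases(parse, tokenization, entities) and must
# return (phrases, phraseDict); this is a hedged reconstruction built from the
# helpers above, not the original implementation. The sub-phrase filters
# passed to the helpers are assumptions.
def makePhrases(parse, tokenization, entities=None):
    tokens = tokenization.findall("token")
    phrases = getPhrases(parse, tokens)
    phraseDict = makePhraseDict(phrases)
    # Extend the parser's phrases with derived candidate spans
    phrases.extend(makeINSubPhrases(phrases, tokens, phraseDict, ["NP"]))
    phrases.extend(makeDETSubPhrases(phrases, tokens, phraseDict, ["NP", "NP-IN"]))
    phrases.extend(makeTokenSubPhrases(tokens, phraseDict))
    if entities != None:
        phrases = removeNamedEntityPhrases(entities, phrases, phraseDict)
    return phrases, phraseDict
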
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    # Return all phrases whose span lies between the entity's head span
    # (altOffset) and its full span (def line reconstructed from the call sites)
    matches = []
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minOffset = entity.get("altOffset")
    if minOffset != None:
        minOffset = Range.charOffsetToSingleTuple(minOffset)
    else:
        minOffset = maxOffset
    for phraseOffset in phraseOffsets:
        if Range.contains(maxOffset, phraseOffset) and Range.contains(phraseOffset, minOffset):
            matches.extend(phraseDict[phraseOffset])
    return matches
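
# selectBestMatch was elided from the source. It must pick one phrase from
# several matches; this is a plausible reconstruction, assuming exact-span
# preference and otherwise the smallest boundary distance to the entity's
# (head) offset.
def selectBestMatch(entity, phrases):
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") != None:
        entOffset = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    best = None
    bestDist = None
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if phraseOffset == entOffset: # an exact span match wins outright
            return phrase
        dist = abs(entOffset[0] - phraseOffset[0]) + abs(entOffset[1] - phraseOffset[1])
        if bestDist == None or dist < bestDist:
            best = phrase
            bestDist = dist
    return best
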
def getPhraseEntityMapping(entities, phraseDict):
    # Map each phrase to the entities it best matches (the def line was
    # missing; the name getPhraseEntityMapping is an assumption)
    phraseOffsets = phraseDict.keys()
    phraseToEntity = {}
    for entity in entities:
        if entity.get("isName") == "True":
            continue
        matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
        if len(matches) == 1:
            bestMatch = matches[0]
        elif len(matches) == 0:
            bestMatch = None
        else:
            bestMatch = selectBestMatch(entity, matches)
        if bestMatch != None:
            if not phraseToEntity.has_key(bestMatch):
                phraseToEntity[bestMatch] = []
            phraseToEntity[bestMatch].append(entity)
    return phraseToEntity
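
# getNECounts was elided as well; it is used below as phraseNECounts[phrase],
# so it must map each phrase element to an integer. Assuming it counts the
# named entities contained in each phrase's span:
def getNECounts(phrases, entities):
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = 0
        for entity in entities:
            if entity.get("isName") != "True": # count only named entities
                continue
            if Range.contains(phraseOffset, Range.charOffsetToSingleTuple(entity.get("charOffset"))):
                counts[phrase] += 1
    return counts
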
def processCorpus(input, parserName):
    # Match every entity in the corpus to candidate phrases and print match
    # statistics (def line reconstructed from the call in the main block)
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda : [0,0])
    filteredMatchByType = defaultdict(lambda : [0,0])
    # Phrase types considered relevant when computing filtered match counts
    filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser":parserName})
            if parse == None:
                continue
            tokenization = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()

            phraseNECounts = getNECounts(phrases, entities)

            # Count the candidate phrases per type
            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type")+"_NE"+str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            # Mark anaphora/antecedent roles from coreference interactions
            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("isName") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print "  match", count, ETUtils.toStr(phrase), "NE" + str(phraseNECounts[phrase]), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type")+"_"+cType][1] += 1
                    matchByType[phrase.get("type")+"_"+cType+"_NE"+str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1

                if count == 0:
                    print "  NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1

                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print "  MULTIMATCH("+ entity.get("charOffset")+","+str(entity.get("altOffset")) + ")", ", ".join([x.get("type") + "_" + x.get("charOffset") for x in matches]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get("charOffset") + ")"

                if filteredCount == 0: counts["no-match-filtered"] += 1
                else: counts["match-filtered"] += 1
    print "Match"
    for key in sorted(matchByType.keys()):
        print "  ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts

if __name__=="__main__":
    print >> sys.stderr, "##### Map phrases to entities #####"

    from optparse import OptionParser

    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-p", "--parser", default="split-McClosky", dest="parser", help="Name of the parse element to read phrases from")
    (options, args) = optparser.parse_args()

    if options.input == None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)

    processCorpus(options.input, options.parser)
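
# Example invocation (the module's file name is not shown in this excerpt, so
# MapPhrases.py here is hypothetical):
#   python MapPhrases.py -i corpus.xml -p split-McClosky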