Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module NameGazetteer
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.NameGazetteer

  1  __version__ = "$Revision: 1.1 $" 
  2   
  3  import sys, os, types 
  4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  5  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  6  import Core.SentenceGraph as SentenceGraph 
  7  import Utils.ElementTreeUtils as ETUtils 
  8  #from Utils.ProgressCounter import ProgressCounter 
  9  import Utils.InteractionXML.CorpusElements as CorpusElements 
 10  import Utils.Range as Range 
class NameGazetteer:
    """
    Gazetteer of entity names, stored as a trie keyed by token text.

    self.names is the trie root: each key is a (optionally normalized) token
    text mapping to a child dictionary. A None key inside a dictionary marks
    that the path leading to it is a complete name.
    """

    def __init__(self):
        self.names = {}        # trie root, see class docstring
        self.normalize = True  # normalize token texts before storing/matching

    def normalizeText(self, text):
        """
        Return text lowercased, with hyphens, slashes, commas, backslashes
        and spaces removed.
        """
        for char in ("-", "/", ",", "\\", " "):
            text = text.replace(char, "")
        return text.lower()

    @classmethod
    def build(cls, input, output, parse, tokenization=None):
        """
        Create a gazetteer from a corpus, save it to output and return it.

        The arguments are forwarded unchanged to fromXML (input, parse,
        tokenization) and save (output).
        """
        # Instantiate through cls so that subclasses build their own type.
        gaz = cls()
        gaz.fromXML(input, parse, tokenization)
        gaz.save(output)
        return gaz

    def addName(self, tokens, parent=None):
        """
        Insert a name, given as a list of token strings, into the trie.

        tokens -- token texts of the name; an empty list is ignored
        parent -- trie node to insert under (defaults to the root); used by
                  the recursion
        """
        if len(tokens) == 0:
            return
        if parent is None:
            parent = self.names

        tokText = tokens[0]
        if self.normalize:
            tokText = self.normalizeText(tokText)
        # The token itself is always stored. (The original started from an
        # empty list and then indexed it, raising IndexError for every name.)
        texts = [tokText]
        # A token longer than two characters ending in a digit is also stored
        # without its last character -- presumably so that numbered variants
        # match their base name; confirm against the intended use.
        if len(tokText) > 2 and tokText[-1].isdigit():
            texts.append(tokText[:-1])

        for text in texts:
            if text not in parent:
                parent[text] = {}
            if len(tokens) == 1:
                parent[text][None] = None  # mark string end
            else:
                self.addName(tokens[1:], parent[text])

    def save(self, output, parent=None, path=[]):
        """
        Write every complete name to output, one per line, with the token
        texts of a name separated by tabs. Names are written in sorted order.

        output -- file name (opened for writing) or an object with write()
                  and close()
        parent -- trie node to write (defaults to the root); used by the
                  recursion
        path   -- token texts leading to parent; used by the recursion. The
                  default list is never mutated.

        When called without parent, output is closed before returning, even
        if the caller supplied an already open file object.
        """
        opened = isinstance(output, str)
        if opened:
            output = open(output, "wt")
        topLevel = parent is None
        if topLevel:
            parent = self.names

        try:
            if None in parent and len(path) > 0:
                output.write("\t".join(path) + "\n")
            # The end marker (None) is filtered out before sorting: it is not
            # a name token and cannot be ordered against strings on Python 3.
            for key in sorted(k for k in parent if k is not None):
                self.save(output, parent[key], path + [key])
        finally:
            # Close on errors too, and never leak a file opened here.
            if topLevel or opened:
                output.close()

    def fromXML(self, input, parse, tokenization=None):
        """
        Rebuild the gazetteer from the name entities of a corpus.

        input -- a file name, which is passed to CorpusElements.loadCorpus
                 together with parse and tokenization, or an already loaded
                 corpus object providing a "sentences" sequence

        Every entity whose "isName" attribute is "True" is added twice: once
        as its sequence of token texts and once as a single token formed by
        concatenating them. Existing names are discarded first.
        """
        self.names = {}
        if isinstance(input, str):
            corpus = CorpusElements.loadCorpus(input, parse, tokenization)
        else:
            corpus = input
        for sentence in corpus.sentences:
            tokenTuples = self.prepareTokens(sentence.tokens)
            for entity in sentence.entities:
                if entity.get("isName") != "True":
                    continue
                tokens = self.getTokens(entity, tokenTuples)
                assert len(tokens) > 0
                self.addName(tokens)
                self.addName(["".join(tokens)])

    def prepareTokens(self, tokens):
        """
        Return a list of (offset, token) tuples, where offset is the token's
        "charOffset" attribute converted by Range.charOffsetToSingleTuple.
        """
        tokenTuples = []
        for token in tokens:
            offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            tokenTuples.append((offset, token))
        return tokenTuples

    def getTokens(self, entity, tokenTuples):
        """
        Return the "text" attributes of the tokens whose offsets overlap the
        entity's "charOffset".

        tokenTuples -- (offset, token) tuples as returned by prepareTokens

        Scanning stops at the first non-overlapping token after a match, so
        only one contiguous run is returned (this assumes the tokens are in
        text order -- confirm against the corpus loader).
        """
        offset = entity.get("charOffset")
        assert offset is not None
        offset = Range.charOffsetToSingleTuple(offset)
        match = []
        for tokenOffset, token in tokenTuples:
            if Range.overlap(offset, tokenOffset):
                match.append(token.get("text"))
            elif len(match) > 0:  # passed end
                break
        return match

    def matchTokens(self, tokens, tokenIsName, nameDict=None, tokenSet=None, tokenChain=[]):
        """
        Return the set of tokens that are part of a gazetteer name.

        tokens      -- sequence of token elements (hashable, with get("text"))
        tokenIsName -- mapping token -> bool; a token mapped to a true value
                       is never matched and ends any match in progress
        nameDict    -- trie node at which the match beginning at tokens[0]
                       continues (defaults to the root)
        tokenSet    -- set to add matched tokens to (a new set by default)
        tokenChain  -- tokens already consumed on the way to nameDict; they
                       are added together with any match completed from
                       tokens[0]. The default list is never mutated.

        A match is attempted from every position of tokens. Earlier versions
        discarded the results for all positions but the first, and returned
        None for an empty token list; an empty set is returned now.
        """
        if tokenSet is None:
            tokenSet = set()
        if nameDict is None:
            nameDict = self.names

        # Continue (or begin) the match at the first token from the given
        # trie node, then begin fresh matches at every later position.
        self._matchFrom(tokens, 0, tokenIsName, nameDict, tokenSet, list(tokenChain))
        for start in range(1, len(tokens)):
            self._matchFrom(tokens, start, tokenIsName, self.names, tokenSet, [])
        return tokenSet

    def _matchFrom(self, tokens, start, tokenIsName, node, tokenSet, chain):
        """
        Walk the trie from node along tokens[start:], adding the consumed
        tokens (chain) to tokenSet each time a complete name is reached.
        """
        for index in range(start, len(tokens)):
            token = tokens[index]
            if tokenIsName[token]:
                break
            text = token.get("text")
            assert text is not None
            if self.normalize:
                text = self.normalizeText(text)
            if text not in node:
                break
            node = node[text]
            chain.append(token)
            if None in node:  # string end
                tokenSet.update(chain)
if __name__=="__main__":
    # Command line use: build a gazetteer from a corpus, save it, and
    # optionally print the names it finds in a second (test) corpus.

    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nBuild a name gazetteer from an interaction XML corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input file (interaction XML)")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file name")
    optparser.add_option("-e", "--test", default=None, dest="test", help="")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse XML element name")
    optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization XML element name")
    (options, args) = optparser.parse_args()

    # Saving to a missing output always fails, so report it before the
    # corpus is loaded rather than after.
    if options.output is None:
        optparser.error("no output file defined (-o)")

    corpus = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization)
    gaz = NameGazetteer.build(corpus, options.output, options.parse, options.tokenization)

    if options.test is not None:
        corpus = SentenceGraph.loadCorpus(options.test, options.parse, options.tokenization)
        for sentence in corpus.sentences:
            tokenSet = gaz.matchTokens(sentence.tokens, sentence.sentenceGraph.tokenIsName)
            # Group consecutive matched tokens into chains. The chain state
            # must persist across tokens; it used to be reset on every
            # iteration, so separate matches were never split apart.
            chains = []
            current = []
            for token in sentence.tokens:
                if token in tokenSet:
                    current.append(token.get("text"))
                elif current:
                    chains.append(current)
                    current = []
            if current:
                chains.append(current)
            # One chain per line (tokens tab-separated) below the sentence
            # id, followed by an empty line.
            if chains:
                lines = ["\t".join(chain) + "\n" for chain in chains]
                sys.stdout.write(sentence.sentence.get("id") + "\n" + "".join(lines) + "\n")