TEES.ExampleBuilders.FeatureBuilders.NameGazetteer

13

def __init__(self):
    """Create an empty gazetteer with text normalization switched on."""
    # Flag read by addName and matchTokens before they look a token up.
    self.normalize = True
    # Nested dictionary of token texts; filled by addName.
    self.names = {}

17

def normalizeText(self, text):
    """
    Return a lookup-friendly form of text.

    Hyphens, slashes, commas, backslashes and spaces are removed and the
    remainder is lower-cased.
    """
    cleaned = text
    for separator in ("-", "/", ",", "\\", " "):
        cleaned = cleaned.replace(separator, "")
    return cleaned.lower()

@classmethod
def build(cls, input, output, parse, tokenization=None):
    """
    Build a gazetteer from a corpus and save it in one step.

    input, parse and tokenization are handed unchanged to fromXML, and
    output is handed unchanged to save.

    Returns the populated gazetteer instance.
    """
    # Instantiate through cls so that calling build on a subclass yields
    # an instance of that subclass rather than the base class.
    gaz = cls()
    gaz.fromXML(input, parse, tokenization)
    gaz.save(output)
    return gaz

27

def addName(self, tokens, parent=None):
    """
    Insert a name, given as a list of token strings, into the name tree.

    tokens -- token texts of the name; an empty list is a no-op.
    parent -- dictionary level to insert into; defaults to self.names.

    The first token is normalized when self.normalize is set. If the
    resulting text is longer than two characters and ends in a digit, the
    text without that last character is inserted as an alternative key.
    The remaining tokens are inserted recursively below each key, and the
    end of a name is marked with a None key.
    """
    if len(tokens) == 0:
        return
    if parent is None:
        parent = self.names

    tokText = tokens[0]
    if self.normalize:
        tokText = self.normalizeText(tokText)
    # The token text itself must always be a candidate key; the list used
    # to start out empty, which made the check below raise IndexError.
    texts = [tokText]
    if len(tokText) > 2 and tokText[-1].isdigit():
        texts.append(tokText[:-1])

    for text in texts:
        if text not in parent:
            parent[text] = {}
        if len(tokens) == 1:
            parent[text][None] = None  # mark string end
        else:
            self.addName(tokens[1:], parent[text])

49

def save(self, output, parent=None, path=[]):
    """
    Write every stored name as one tab-separated line.

    output -- file name (opened for text writing) or an object with
              write() and close().
    parent -- dictionary level to write; None means start at self.names.
    path   -- keys leading to parent; only read, never modified, so the
              shared default list is safe.

    A line is written for each level that carries the None end marker.
    Keys are visited in sorted order. The stream is closed when the call
    started at the top level (parent was None) or opened the file itself.
    """
    openedHere = isinstance(output, str)
    if openedHere:
        output = open(output, "wt")
    topLevel = parent is None
    if topLevel:
        parent = self.names

    try:
        if None in parent and len(path) > 0:
            output.write("\t".join(path) + "\n")
        # The None end marker is filtered out before sorting because None
        # and strings cannot be ordered against each other on Python 3.
        for key in sorted(k for k in parent if k is not None):
            self.save(output, parent[key], path + [key])
    finally:
        # Close even when writing fails so the handle is not leaked.
        if topLevel or openedHere:
            output.close()

66

def fromXML(self, input, parse, tokenization=None):
    """
    Rebuild the name tree from the named entities of a corpus.

    input -- a string, which is loaded with CorpusElements.loadCorpus
             together with parse and tokenization, or an already loaded
             corpus object exposing a sentences attribute.

    Any previously stored names are discarded. For every entity whose
    isName attribute is "True", its tokens are added once as a token
    sequence and once joined into a single token.
    """
    self.names = {}
    # isinstance also accepts str subclasses and does not depend on the
    # Python-2-only types.StringType.
    if isinstance(input, str):
        corpus = CorpusElements.loadCorpus(input, parse, tokenization)
    else:
        corpus = input

    for sentence in corpus.sentences:
        tokenTuples = self.prepareTokens(sentence.tokens)
        for entity in sentence.entities:
            if entity.get("isName") != "True":
                continue
            tokens = self.getTokens(entity, tokenTuples)
            assert len(tokens) > 0
            self.addName(tokens)
            # Also store the concatenated form as a single-token name.
            self.addName(["".join(tokens)])

81

def prepareTokens(self, tokens):
    """
    Pair each token with its converted character offset.

    Returns a list of (offset, token) tuples in input order, where offset
    is the token's "charOffset" attribute passed through
    Range.charOffsetToSingleTuple.
    """
    return [
        (Range.charOffsetToSingleTuple(tok.get("charOffset")), tok)
        for tok in tokens
    ]

87

def getTokens(self, entity, tokenTuples):
    """
    Collect the texts of the tokens whose offsets overlap the entity.

    entity      -- element with a "charOffset" attribute (must be present).
    tokenTuples -- (offset, token) pairs as produced by prepareTokens.

    Scanning stops at the first non-overlapping token that follows at
    least one overlapping token. Returns the list of matched token texts.
    """
    rawOffset = entity.get("charOffset")
    assert rawOffset is not None
    entitySpan = Range.charOffsetToSingleTuple(rawOffset)

    matched = []
    for pair in tokenTuples:
        tokenSpan = pair[0]
        if Range.overlap(entitySpan, tokenSpan):
            matched.append(pair[1].get("text"))
            continue
        if matched:  # passed end
            break
    return matched

99

def matchTokens(self, tokens, tokenIsName, nameDict=None, tokenSet=None, tokenChain=[]):
    """
    Find the tokens that are part of a stored name.

    tokens      -- sequence of token objects providing get("text"); they
                   must be hashable because they are stored in a set.
    tokenIsName -- mapping from token to a boolean; tokens mapped to a
                   true value are never matched.
    nameDict    -- dictionary level to match the first token against;
                   defaults to self.names.
    tokenSet    -- set that collects matched tokens; created when None.
    tokenChain  -- tokens already matched on the way to nameDict; only
                   read, never modified, so the shared default is safe.

    Returns tokenSet, or None when tokens is empty.
    """
    if len(tokens) == 0:
        return None
    if tokenSet is None:
        tokenSet = set()
    if nameDict is None:
        nameDict = self.names

    token = tokens[0]
    remaining = tokens[1:]
    # Names may start at any token, so restart matching from the root at
    # the next position. The shared tokenSet must be passed along or those
    # matches are thrown away. Restarting only from the head of a chain
    # visits every start position once instead of repeating the scan at
    # each depth of a partial match.
    if not tokenChain:
        self.matchTokens(remaining, tokenIsName, self.names, tokenSet)

    if not tokenIsName[token]:
        text = token.get("text")
        assert text != None
        if self.normalize:
            text = self.normalizeText(text)
        if text in nameDict:
            subDict = nameDict[text]
            if None in subDict:  # string end
                tokenSet.update(tokenChain)
                tokenSet.add(token)
            # Try to extend the match with the following tokens.
            self.matchTokens(remaining, tokenIsName, subDict, tokenSet, tokenChain + [token])
    return tokenSet

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.NameGazetteer