1 __version__ = "$Revision: 1.1 $"
2
3 import sys, os, types
4 thisPath = os.path.dirname(os.path.abspath(__file__))
5 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
6 import Core.SentenceGraph as SentenceGraph
7 import Utils.ElementTreeUtils as ETUtils
8
9 import Utils.InteractionXML.CorpusElements as CorpusElements
10 import Utils.Range as Range
13
15 self.names = {}
16 self.normalize = True
17
def normalizeText(self, text):
    """
    Return ``text`` in the canonical form used for gazetteer keys.

    Hyphens, slashes, commas, backslashes and spaces are deleted, and the
    remainder is lower-cased.
    """
    stripped = text
    # Delete the separator characters one at a time, in the fixed order
    # hyphen, slash, comma, backslash, space.
    for separator in ("-", "/", ",", "\\", " "):
        stripped = stripped.replace(separator, "")
    return stripped.lower()
20
21 @classmethod
22 - def build(cls, input, output, parse, tokenization=None):
27
def addName(self, tokens, parent=None):
    """
    Insert a name, given as a sequence of token texts, into the name trie.

    The trie is a nested dictionary: each level maps a token text to the
    dictionary for the tokens that may follow it, and the key ``None``
    inside a dictionary marks that the path leading to it is a complete
    name.

    tokens -- sequence of token texts forming one name; an empty sequence
              is ignored.
    parent -- dictionary to insert into; ``None`` (the default) means the
              root dictionary ``self.names``.

    When ``self.normalize`` is set, each token text is passed through
    ``self.normalizeText`` before being used as a key. A text longer than
    two characters that ends in a digit is additionally registered without
    that final character.
    """
    if len(tokens) == 0:
        return
    if parent is None:
        parent = self.names

    tokText = tokens[0]
    if self.normalize:
        tokText = self.normalizeText(tokText)

    # The token text itself is always registered. (Previously the list
    # started out empty and was then indexed as texts[0], which raised
    # IndexError for every non-empty name.)
    texts = [tokText]
    if len(tokText) > 2 and tokText[-1].isdigit():
        texts.append(tokText[:-1])

    for text in texts:
        child = parent.setdefault(text, {})
        if len(tokens) == 1:
            # Last token of the name: mark this node as the end of a name.
            child[None] = None
        else:
            self.addName(tokens[1:], child)
49
def save(self, output, parent=None, path=None):
    """
    Write the stored names to ``output``, one name per line.

    A line is written for every dictionary that carries the ``None``
    end-of-name marker and has a non-empty path; the line is the
    tab-separated list of token texts leading to that dictionary. Keys are
    visited in sorted order at every level.

    output -- a file name, or an object with ``write`` and ``close``
              methods. Anything without a ``write`` attribute is treated
              as a file name and opened in "wt" mode (this also accepts
              unicode paths, which the former exact-type check rejected).
    parent -- sub-dictionary to write; ``None`` (the default) means all of
              ``self.names``.
    path   -- token texts leading to ``parent``; defaults to an empty path.

    The stream is closed when the call was made without ``parent`` (even
    if the caller supplied the stream) or when this call opened the file
    itself. Closing also happens if writing raises.
    """
    if path is None:
        path = []
    openedHere = not hasattr(output, "write")
    if openedHere:
        output = open(output, "wt")
    topLevel = parent is None
    if topLevel:
        parent = self.names

    try:
        if None in parent and len(path) > 0:
            output.write("\t".join(path) + "\n")
        # The None marker is filtered out before sorting: it is not a token
        # text, and it cannot be ordered against strings on Python 3.
        for key in sorted(k for k in parent if k is not None):
            self.save(output, parent[key], path + [key])
    finally:
        if topLevel or openedHere:
            output.close()
66
67 - def fromXML(self, input, parse, tokenization=None):
81
87
89 offset = entity.get("charOffset")
90 assert offset != None
91 offset = Range.charOffsetToSingleTuple(offset)
92 match = []
93 for tokenTuple in tokenTuples:
94 if Range.overlap(offset, tokenTuple[0]):
95 match.append(tokenTuple[1].get("text"))
96 elif len(match) > 0:
97 break
98 return match
99
def matchTokens(self, tokens, tokenIsName, nameDict=None, tokenSet=None, tokenChain=None):
    """
    Collect the tokens that take part in a name stored in the gazetteer.

    A match attempt is started at every position of ``tokens``. From the
    start position the token texts are followed through the nested name
    dictionary; each time the dictionary reached carries the ``None``
    end-of-name marker, all tokens on the path so far are added to the
    result. An attempt stops at the first token that is flagged in
    ``tokenIsName`` or whose text is not a key of the current dictionary.

    tokens      -- sequence of token objects; each must be hashable and
                   provide ``get("text")``.
    tokenIsName -- mapping from token to a truth value; flagged tokens are
                   never matched.
    nameDict    -- dictionary in which the attempt at the FIRST position
                   starts; ``None`` (the default) means ``self.names``.
                   Attempts at later positions always start from
                   ``self.names``.
    tokenSet    -- set to add matches to; a new set is created when omitted.
    tokenChain  -- tokens regarded as already matched before ``tokens[0]``
                   for the first attempt; defaults to none.

    Returns the set of matched tokens. An empty ``tokens`` sequence yields
    the (possibly empty) ``tokenSet`` instead of ``None``.

    Matches that begin after the first token used to be computed into a
    separate, discarded set; they are now accumulated into the returned
    set. The iterative form also avoids re-running the same attempts from
    every recursion level.
    """
    if tokenSet is None:
        tokenSet = set()
    if nameDict is None:
        nameDict = self.names

    for start in range(len(tokens)):
        if start == 0:
            # Only the first position honours the caller-supplied
            # dictionary and chain.
            node, chain = nameDict, list(tokenChain or [])
        else:
            node, chain = self.names, []
        for token in tokens[start:]:
            if tokenIsName[token]:
                break
            text = token.get("text")
            assert text != None
            if self.normalize:
                text = self.normalizeText(text)
            if text not in node:
                break
            node = node[text]
            chain.append(token)
            if None in node:
                # The path so far spells a complete name.
                tokenSet.update(chain)
    return tokenSet
123
# Command-line entry point (Python 2 syntax: print statements are used).
# Parses options, loads a corpus with SentenceGraph.loadCorpus, calls
# NameGazetteer.build on it, then runs matchTokens over sentences and prints
# the texts of matched tokens under each sentence id.
# NOTE(review): indentation is not recoverable from this listing, so the
# nesting of several statements below cannot be established here -- see the
# individual notes.
124 if __name__=="__main__":
125
# Psyco is optional: its absence is only reported on stderr.
126 try:
127 import psyco
128 psyco.full()
129 print >> sys.stderr, "Found Psyco, using"
130 except ImportError:
131 print >> sys.stderr, "Psyco not installed"
132
133 from optparse import OptionParser
# os is already imported at the top of this file; this re-import is redundant.
134 import os
# NOTE(review): the usage text speaks of creating an HTML visualization,
# which does not describe the statements below (gazetteer build + match
# dump) -- looks copied from another script; confirm and correct.
135 optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
136 optparser.add_option("-i", "--input", default=None, dest="input", help="Input file (interaction XML)")
137 optparser.add_option("-o", "--output", default=None, dest="output", help="Output file name")
# --test has an empty help string; below, its value is passed to
# SentenceGraph.loadCorpus as a second corpus file.
138 optparser.add_option("-e", "--test", default=None, dest="test", help="")
139 optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse XML element name")
140 optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization XML element name")
141 (options, args) = optparser.parse_args()
142
# The body of NameGazetteer.build is not visible in this listing; presumably
# it fills the gazetteer from the corpus and writes it to options.output --
# TODO confirm.
143 corpus = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization)
144 gaz = NameGazetteer.build(corpus, options.output, options.parse, options.tokenization)
145
# When --test is given, the corpus variable is replaced by the test corpus.
# NOTE(review): it cannot be told from this listing whether the sentence loop
# that follows is nested under this "if" (runs only with --test) or follows
# it (runs on the input corpus when --test is absent) -- confirm.
146 if options.test != None:
147 corpus = SentenceGraph.loadCorpus(options.test, options.parse, options.tokenization)
148 for sentence in corpus.sentences:
149 tokenSet = gaz.matchTokens(sentence.tokens, sentence.sentenceGraph.tokenIsName)
150 string = ""
151 for token in sentence.tokens:
# chain directly follows the "for" line, so it is the first statement of the
# loop body and is reset to False for every token.
152 chain = False
153 if token in tokenSet:
154 chain = True
# Matched token texts are tab-separated once the string is non-empty.
155 if string != "":
156 string += "\t"
157 string += token.get("text")
# NOTE(review): this branch is reached only when the token is not in
# tokenSet, at which point chain is still False, so it can never fire as
# written. Presumably chain was meant to persist across tokens so that a
# newline ends each run of matched tokens -- TODO confirm intent.
158 elif chain:
159 string += "\n"
# NOTE(review): whether this test sits inside the token loop or after it
# cannot be determined from this listing.
160 if chain:
161 string += "\n"
# Only sentences with at least one matched token are printed.
162 if string != "":
163 print sentence.sentence.get("id") + "\n" + string
164