1 """
2 Base class for ExampleWriters working with interaction XML.
3 """
4
5 import sys, os, types
6 import itertools
7 thisPath = os.path.dirname(os.path.abspath(__file__))
8 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
9 import Core.ExampleUtils as ExampleUtils
10 import Core.SentenceGraph as SentenceGraph
11 from Core.IdSet import IdSet
12 from Utils.ProgressCounter import ProgressCounter
13 try:
14 import xml.etree.cElementTree as ET
15 except ImportError:
16 import cElementTree as ET
17 import Utils.ElementTreeUtils as ETUtils
18 import Utils.InteractionXML.ResolveEPITriggerTypes as ResolveEPITriggerTypes
19 from collections import defaultdict
20
22 """
23 Base class for ExampleWriters working with interaction XML.
24 """
25
28
def write(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, insertWeights=False, exampleStyle=None):
    """
    Write the predicted examples into the interaction XML corpus.

    Thin entry point that forwards its arguments to writeXML() and
    returns whatever writeXML() returns. The insertWeights argument is
    accepted but not passed on.
    """
    positionalArgs = (examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus)
    xmlTree = self.writeXML(*positionalArgs, exampleStyle=exampleStyle)
    return xmlTree
31
def loadCorpus(self, corpus, parse, tokenization):
    """
    Return the corpus as a loaded object.

    A string or an ElementTree is passed to SentenceGraph.loadCorpus
    together with the parse and tokenization arguments. Any other value
    is returned unchanged (presumably an already loaded corpus object).
    """
    # isinstance() also accepts str subclasses and does not depend on
    # types.StringType, which exists only in Python 2.
    if isinstance(corpus, (str, ET.ElementTree)):
        return SentenceGraph.loadCorpus(corpus, parse, tokenization)
    return corpus
37
39 if type(predictions) == types.StringType:
40 print >> sys.stderr, "Reading predictions from", predictions
41 predictions = ExampleUtils.loadPredictions(predictions)
42 if type(examples) == types.StringType:
43 print >> sys.stderr, "Reading examples from", examples
44 examples = ExampleUtils.readExamples(examples, False)
45 return examples, predictions
46
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None):
    """
    Write classified examples into the interaction XML corpus.

    Examples and predictions are consumed pairwise and grouped by
    sentence: the sentence id is the part of the example id (example[0])
    before the last ".x". Each group is handed to writeXMLSentence()
    together with the matching sentence object from the corpus.
    Sentences of the corpus that received no examples are passed to
    writeXMLSentence() with an empty group.

    examples, predictions -- passed through self.loadExamples() first
    corpus, goldCorpus -- passed through self.loadCorpus() first
    classSet -- an id set object, a string given to IdSet(filename=...),
                or None
    outputFile -- target for the corpus root element, None to skip writing
    exampleStyle -- forwarded unchanged to writeXMLSentence()

    Returns corpus.tree. Raises AssertionError if there are no examples,
    if examples and predictions differ in number, if the examples of one
    sentence are not contiguous, or if example[3]["xtype"] differs from
    self.xType.
    """
    corpus = self.loadCorpus(corpus, parse, tokenization)
    if goldCorpus is not None:
        # Load the gold corpus itself; loading `corpus` here would make
        # every gold sentence identical to the sentence being written.
        goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
    examples, predictions = self.loadExamples(examples, predictions)

    if isinstance(classSet, str):
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet is not None:
        classIds = classSet.getIds()

    def writeSentence(sentenceId, sentenceExamples, sentencePredictions):
        # Look up the sentence (and its gold counterpart, if any) by id
        # and delegate the actual writing to writeXMLSentence().
        sentenceObject = corpus.sentencesById[sentenceId]
        goldSentence = None
        if goldCorpus is not None:
            goldSentence = goldCorpus.sentencesById[sentenceId]
        self.writeXMLSentence(sentenceExamples, sentencePredictions, sentenceObject, classSet, classIds, goldSentence=goldSentence, exampleStyle=exampleStyle)

    # NOTE(review): examples is iterated twice (once to count, once to
    # write), so it has to be re-iterable rather than a one-shot generator.
    count = sum(1 for _ in examples)
    assert count > 0
    progress = ProgressCounter(count, "Write Examples")

    exampleQueue = []  # examples of the sentence currently being collected
    predictionsByExample = {}  # example id -> prediction, same sentence
    currentMajorId = None
    prevMajorIds = set()  # sentences already written, to detect reappearance
    processedSentenceIds = set()

    for example, prediction in itertools.izip_longest(examples, predictions):
        # izip_longest pads the shorter input with None
        assert example is not None
        assert prediction is not None
        majorId, minorId = example[0].rsplit(".x", 1)

        if majorId != currentMajorId:
            # A new sentence starts: write out the finished one first
            if currentMajorId is not None:
                processedSentenceIds.add(currentMajorId)
                writeSentence(currentMajorId, exampleQueue, predictionsByExample)
                progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
                exampleQueue = []
                predictionsByExample = {}
                prevMajorIds.add(currentMajorId)
            assert majorId not in prevMajorIds, majorId
            currentMajorId = majorId
        exampleQueue.append(example)
        predictionsByExample[example[0]] = prediction
        assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)

    # Write out the last collected sentence
    if currentMajorId is not None:
        processedSentenceIds.add(currentMajorId)
        writeSentence(currentMajorId, exampleQueue, predictionsByExample)
        progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")

    # Sentences without any examples are still passed to writeXMLSentence
    for sentenceId in sorted(corpus.sentencesById.keys()):
        if sentenceId not in processedSentenceIds:
            # The gold sentence is looked up with this sentence's own id,
            # not with the id of the last sentence that had examples.
            writeSentence(sentenceId, [], {})

    # Print the collected counts and reset them
    if len(self.counts) > 0:
        print >> sys.stderr, self.counts
        self.counts = defaultdict(int)

    if outputFile is not None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpus.rootElement, outputFile)
    return corpus.tree
130
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None):
    """
    Write the examples of a single sentence.

    This version has no implementation and always raises
    NotImplementedError (presumably it is meant to be overridden).
    """
    raise NotImplementedError()
133
135 currentSetMajorId = None
136 for example in examples:
137 majorId, minorId = example[0].rsplit(".x", 1)
138 if currentSetMajorId == None:
139 currentSetMajorId = majorId
140 else:
141 assert currentSetMajorId == majorId, str(currentSetMajorId) + "/" + str(majorId)
142 if sentenceId != None and len(examples) > 0:
143 assert sentenceId == currentSetMajorId, sentenceId + "/" + currentSetMajorId
144
146 removed = []
147 for tag in childTags:
148 childElements = element.findall(tag)
149 if childElements != None:
150 for childElement in childElements:
151 if childAttributes == None:
152 removed.append(childElement)
153 element.remove(childElement)
154 else:
155 removeElement = True
156 for k, v in childAttributes.iteritems():
157 if childElement.get(k) != v:
158 removeElement = False
159 break
160 if removeElement:
161 removed.append(childElement)
162 element.remove(childElement)
163 return removed
164
"""
Removes non-name entities (those whose "isName" attribute is "False")
and returns the list of removed entity elements.
"""
170 entityElements = sentenceElement.findall("entity")
171 removed = []
172 if entityElements != None:
173 entityCount = len(entityElements)
174 for entityElement in entityElements:
175 if entityElement.get("isName") == "False":
176 removed.append(entityElement)
177 sentenceElement.remove(entityElement)
178 return removed
179
181 if classSet == None:
182 if prediction[0] > 0:
183 return False
184 else:
185 return True
186 else:
187 return classSet.getName(prediction[0]) == "neg"
188
def getElementTypes(self, prediction, classSet=None, classIds=None, unmergeEPINegText=None):
    """
    Return the list of type names for a prediction.

    prediction[0] is the predicted class id. Without a classSet the
    result is ["True"] when that id is greater than zero and ["False"]
    otherwise. With a classSet the class name is split on "---", and
    when unmergeEPINegText is given each part is mapped through
    ResolveEPITriggerTypes.determineNewType. classIds is not used.
    """
    if classSet == None:
        return [str(True)] if prediction[0] > 0 else [str(False)]
    typeNames = classSet.getName(prediction[0]).split("---")
    if unmergeEPINegText == None:
        return typeNames
    return [ResolveEPITriggerTypes.determineNewType(typeName, unmergeEPINegText) for typeName in typeNames]
201
def setElementType(self, element, prediction, classSet=None, classIds=None, unmergeEPINeg=False):
    """
    Store a prediction in the attributes of an XML element.

    prediction[0] is the predicted class id and prediction[1:] holds one
    weight per class. Without a classSet the "type" attribute is set to
    "True" when the id is greater than zero and to "False" otherwise.
    With a classSet, "type" is set to the name of the predicted class
    and "predictions" to a comma separated list of "name:weight" pairs,
    where the name of the i-th weight is classSet.getName(classIds[i]).
    When unmergeEPINeg is true, every class name is first mapped through
    ResolveEPITriggerTypes.determineNewType using the element's "text"
    attribute.
    """
    if classSet is None:
        element.attrib["type"] = str(prediction[0] > 0)
        return

    eText = element.get("text")
    typeName = classSet.getName(prediction[0])
    if unmergeEPINeg:
        typeName = ResolveEPITriggerTypes.determineNewType(typeName, eText)
    element.set("type", typeName)

    weightStrings = []
    for i, classWeight in enumerate(prediction[1:]):
        className = classSet.getName(classIds[i])
        if unmergeEPINeg:
            # The module is imported as ResolveEPITriggerTypes; the name
            # InteractionXML is not bound in this file.
            className = ResolveEPITriggerTypes.determineNewType(className, eText)
        weightStrings.append(className + ":" + str(classWeight))
    element.attrib["predictions"] = ",".join(weightStrings)
224
226 classWeights = prediction[1:]
227 predictionString = ""
228 for i in range(len(classWeights)):
229 className = classSet.getName(classIds[i])
230 if skipClasses != None and className in skipClasses:
231 continue
232 if predictionString != "":
233 predictionString += ","
234 predictionString += classSet.getName(classIds[i]) + ":" + str(classWeights[i])
235 return predictionString
236