Package TEES :: Package ExampleWriters :: Module SentenceExampleWriter
[hide private]

Source Code for Module TEES.ExampleWriters.SentenceExampleWriter

  1  """ 
  2  Base class for ExampleWriters working with interaction XML. 
  3  """ 
  4       
  5  import sys, os, types 
  6  import itertools 
  7  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  8  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  9  import Core.ExampleUtils as ExampleUtils 
 10  import Core.SentenceGraph as SentenceGraph 
 11  from Core.IdSet import IdSet 
 12  from Utils.ProgressCounter import ProgressCounter 
 13  try: 
 14      import xml.etree.cElementTree as ET 
 15  except ImportError: 
 16      import cElementTree as ET 
 17  import Utils.ElementTreeUtils as ETUtils 
 18  import Utils.InteractionXML.ResolveEPITriggerTypes as ResolveEPITriggerTypes 
 19  from collections import defaultdict 
 20   
class SentenceExampleWriter:
    """
    Base class for ExampleWriters working with interaction XML.

    Subclasses must implement writeXMLSentence(). writeXML() also reads
    self.xType, which this base class does not define, so subclasses are
    expected to provide it.
    """

    def __init__(self):
        # NOTE: the counter is stored on the class, not on the instance.
        # writeXML() later rebinds self.counts on the instance after printing.
        SentenceExampleWriter.counts = defaultdict(int)

    def write(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, insertWeights=False, exampleStyle=None):
        """
        Entry point that forwards to writeXML().

        insertWeights is accepted for interface compatibility but is not
        forwarded; writeXML() has no such parameter.
        """
        return self.writeXML(examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus, exampleStyle=exampleStyle)

    def loadCorpus(self, corpus, parse, tokenization):
        """
        Return a loaded corpus.

        A string or an ElementTree is passed to SentenceGraph.loadCorpus();
        anything else is assumed to be loaded already and is returned as is.
        """
        # On Python 2 "str" is the same type object as types.StringType.
        if isinstance(corpus, (str, ET.ElementTree)): # corpus is in file
            return SentenceGraph.loadCorpus(corpus, parse, tokenization)
        return corpus

    def loadExamples(self, examples, predictions):
        """
        Load examples and/or predictions when they are given as strings
        (file names); other values are returned unchanged.

        Returns the tuple (examples, predictions).
        """
        if isinstance(predictions, str):
            print >> sys.stderr, "Reading predictions from", predictions
            predictions = ExampleUtils.loadPredictions(predictions)
        if isinstance(examples, str):
            print >> sys.stderr, "Reading examples from", examples
            examples = ExampleUtils.readExamples(examples, False)
        return examples, predictions

    def _writeSentence(self, sentenceId, examples, predictionsByExample, corpus, goldCorpus, classSet, classIds, exampleStyle):
        """
        Look up one sentence (and its gold counterpart when a gold corpus is
        given) by id and hand it to writeXMLSentence().
        """
        sentenceObject = corpus.sentencesById[sentenceId]
        goldSentence = None
        if goldCorpus is not None:
            goldSentence = goldCorpus.sentencesById[sentenceId]
        self.writeXMLSentence(examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence, exampleStyle=exampleStyle)

    def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None):
        """
        Write the predictions for all examples into the corpus, one sentence
        at a time, and optionally save the corpus.

        Examples are grouped by sentence id, i.e. the part of example[0]
        before the last ".x". All examples of a sentence must be consecutive;
        a sentence id that reappears later triggers an assertion. Sentences
        without any examples are still passed to writeXMLSentence() with an
        empty example list.

        @param examples: examples, or a string from which to read them
        @param predictions: one prediction per example, or a string from which to read them
        @param corpus: loaded corpus, or anything loadCorpus() accepts
        @param outputFile: where to write the corpus, or None to skip writing
        @param classSet: class id set, a string naming an IdSet file, or None
        @param goldCorpus: optional gold corpus (loaded or loadable)
        @return: corpus.tree
        """
        corpus = self.loadCorpus(corpus, parse, tokenization)
        if goldCorpus is not None:
            # Load the gold corpus itself (the predicted corpus used to be
            # passed here, which made the gold corpus an alias of it).
            goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
        examples, predictions = self.loadExamples(examples, predictions)

        if isinstance(classSet, str): # class names are in file
            classSet = IdSet(filename=classSet)
        classIds = None
        if classSet is not None:
            classIds = classSet.getIds()

        # Counting pass for the progress counter. "examples" is iterated
        # again below, so a one-shot iterator would be exhausted here.
        count = 0
        for example in examples:
            count += 1
        assert count > 0
        progress = ProgressCounter(count, "Write Examples")

        exampleQueue = [] # One sentence's examples
        predictionsByExample = {}
        currentMajorId = None
        processedSentenceIds = set()

        # izip_longest pads the shorter sequence with None, so the two
        # asserts below detect a length mismatch.
        for example, prediction in itertools.izip_longest(examples, predictions):
            assert example is not None
            assert prediction is not None
            majorId, minorId = example[0].rsplit(".x", 1)
            if majorId != currentMajorId: # new sentence
                if currentMajorId is not None:
                    processedSentenceIds.add(currentMajorId)
                    self._writeSentence(currentMajorId, exampleQueue, predictionsByExample, corpus, goldCorpus, classSet, classIds, exampleStyle) # process queue
                    progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
                    exampleQueue = []
                    predictionsByExample = {}
                # Every sentence written so far is in processedSentenceIds,
                # so this catches non-consecutive examples of one sentence.
                assert majorId not in processedSentenceIds, majorId
                currentMajorId = majorId
            exampleQueue.append(example) # queue example
            predictionsByExample[example[0]] = prediction
            assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)

        # Process what is still in queue
        if currentMajorId is not None:
            processedSentenceIds.add(currentMajorId)
            self._writeSentence(currentMajorId, exampleQueue, predictionsByExample, corpus, goldCorpus, classSet, classIds, exampleStyle) # process queue
            progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
            exampleQueue = []
            predictionsByExample = {}

        # Process sentences with no examples (e.g. to clear interactions)
        for sentenceId in sorted(corpus.sentencesById.keys()):
            if sentenceId not in processedSentenceIds:
                # The gold sentence is looked up with this sentence's own id
                # (previously the id of the last sentence with examples).
                self._writeSentence(sentenceId, [], {}, corpus, goldCorpus, classSet, classIds, exampleStyle)

        # Print statistics
        if len(self.counts) > 0:
            print >> sys.stderr, self.counts
            self.counts = defaultdict(int)

        # Write corpus
        if outputFile is not None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpus.rootElement, outputFile)
        return corpus.tree

    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None):
        """
        Write the examples of a single sentence. Must be implemented by
        subclasses; the base class always raises NotImplementedError.
        """
        raise NotImplementedError

    def assertSameSentence(self, examples, sentenceId=None):
        """
        Assert that all examples share one sentence id (the part of
        example[0] before the last ".x") and, when sentenceId is given and
        there are examples, that it equals sentenceId.
        """
        currentSetMajorId = None
        for example in examples:
            majorId, minorId = example[0].rsplit(".x", 1)
            if currentSetMajorId is None:
                currentSetMajorId = majorId
            else:
                assert currentSetMajorId == majorId, str(currentSetMajorId) + "/" + str(majorId)
        if sentenceId is not None and len(examples) > 0:
            assert sentenceId == currentSetMajorId, sentenceId + "/" + currentSetMajorId

    def removeChildren(self, element, childTags, childAttributes=None):
        """
        Remove direct children of element whose tag is in childTags.

        When childAttributes is given, only children whose attributes equal
        every key/value pair in it are removed.

        @return: list of the removed elements
        """
        removed = []
        for tag in childTags:
            for childElement in element.findall(tag):
                if childAttributes is not None:
                    # items() works on both Python 2 and 3 dictionaries
                    matches = all(childElement.get(k) == v for k, v in childAttributes.items())
                    if not matches:
                        continue
                removed.append(childElement)
                element.remove(childElement)
        return removed

    def removeNonNameEntities(self, sentenceElement):
        """
        Removes the "entity" children whose isName attribute is "False"
        and returns the removed elements as a list.
        """
        removed = []
        for entityElement in sentenceElement.findall("entity"):
            if entityElement.get("isName") == "False": # interaction word
                removed.append(entityElement)
                sentenceElement.remove(entityElement)
        return removed

    def isNegative(self, prediction, classSet=None):
        """
        Return True if the predicted class (prediction[0]) is negative:
        not greater than zero without a classSet, otherwise named "neg".
        """
        if classSet is None: # binary classification
            return not prediction[0] > 0
        return classSet.getName(prediction[0]) == "neg"

    def getElementTypes(self, prediction, classSet=None, classIds=None, unmergeEPINegText=None):
        """
        Return the list of type names for a prediction.

        Without a classSet the result is ["True"] or ["False"]. Otherwise
        the predicted class name is split on "---" (merged types) and, if
        unmergeEPINegText is given, each part is passed through
        ResolveEPITriggerTypes.determineNewType() with that text.
        """
        if classSet is None: # binary classification
            if prediction[0] > 0:
                return [str(True)]
            return [str(False)]
        eTypes = classSet.getName(prediction[0]).split("---") # split merged types
        if unmergeEPINegText is not None: # an element text was provided
            eTypes = [ResolveEPITriggerTypes.determineNewType(eType, unmergeEPINegText) for eType in eTypes]
        return eTypes

    def setElementType(self, element, prediction, classSet=None, classIds=None, unmergeEPINeg=False):
        """
        Set the "type" attribute of element from the prediction and, in the
        multiclass case, a "predictions" attribute of comma-separated
        "className:weight" pairs (weights are prediction[1:], names come
        from classIds in the same order).

        With unmergeEPINeg, class names are passed through
        ResolveEPITriggerTypes.determineNewType() with the element's text.
        """
        eText = element.get("text")
        if classSet is None: # binary classification
            if prediction[0] > 0:
                element.attrib["type"] = str(True)
            else:
                element.attrib["type"] = str(False)
            return
        if unmergeEPINeg:
            element.set("type", ResolveEPITriggerTypes.determineNewType(classSet.getName(prediction[0]), eText))
        else:
            element.attrib["type"] = classSet.getName(prediction[0])
        parts = []
        for i, classWeight in enumerate(prediction[1:]):
            className = classSet.getName(classIds[i])
            if unmergeEPINeg:
                # Use the module alias under which ResolveEPITriggerTypes is
                # imported; "InteractionXML" is not a bound name in this file.
                className = ResolveEPITriggerTypes.determineNewType(className, eText)
            parts.append(className + ":" + str(classWeight))
        element.attrib["predictions"] = ",".join(parts)

    def getPredictionStrengthString(self, prediction, classSet, classIds, skipClasses=None):
        """
        Return the class weights (prediction[1:]) as comma-separated
        "className:weight" pairs, leaving out classes named in skipClasses.
        """
        parts = []
        for i, classWeight in enumerate(prediction[1:]):
            className = classSet.getName(classIds[i])
            if skipClasses is not None and className in skipClasses:
                continue
            parts.append(className + ":" + str(classWeight))
        return ",".join(parts)