Package TEES :: Package Utils :: Package InteractionXML :: Module SentenceElements

Source code for module TEES.Utils.InteractionXML.SentenceElements

  1  import sys 
  2   
def getCorpusIterator(input, output, parse=None, tokenization=None):
    """Stream an Interaction XML corpus one document at a time.

    For every closed <document> element this yields the list of
    SentenceElements objects built from its <sentence> children, then
    passes the (possibly caller-modified) document on to *output*.
    The surrounding <corpus> element is written via begin/end events so
    the output file stays well-formed, and processed elements are
    cleared to keep memory use flat.
    """
    import cElementTreeUtils as ETUtils
    from InteractionXML.SentenceElements import SentenceElements

    writer = ETUtils.ETWriter(output)
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "end" and element.tag == "document":
            # Wrap every sentence before handing the document to the caller.
            yield [SentenceElements(sentenceElement, parse, tokenization,
                                    removeIntersentenceInteractions=False)
                   for sentenceElement in element.findall("sentence")]
            writer.write(element)
        elif element.tag == "corpus":
            if event == "start":
                writer.begin(element)
            else:
                writer.end(element)
        # Free fully-processed subtrees.
        if event == "end" and element.tag in ["document", "corpus"]:
            element.clear()
    writer.close()
27
class SentenceElements:
    """Collects the annotation of one Interaction XML <sentence> element.

    On construction the entities, pairs, interactions and (for the
    requested parse/tokenization) dependencies and tokens of the sentence
    are gathered into plain lists of ElementTree elements.
    """

    def __init__(self, sentenceElement, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False, verbose=False):
        """Extract the annotation elements of *sentenceElement*.

        sentenceElement -- the <sentence> ElementTree element to wrap
        parse -- name of the parse ("parser" attribute) to load, or None
        tokenization -- name of the tokenization to load; for the new
            format it is overridden by the chosen parse's "tokenizer"
            attribute
        removeIntersentenceInteractions -- if True, pairs/interactions
            whose endpoints lie in another sentence are filtered out
            (interactions are kept in self.interSentenceInteractions)
        removeNameInfo -- if True, force isName="False" on all entities
        verbose -- if True, warn on stderr when the requested parse or
            tokenization is not found
        """
        self.sentence = sentenceElement
        self.entities = []
        self.entitiesById = {}
        self.pairs = []
        self.interactions = []
        self.tokens = []
        self.dependencies = []

        self.parseElement = None
        self.tokenizationElement = None

        sentenceId = sentenceElement.get("id")

        # Pairs: entity ids are "<sentenceId>.<entityNum>", so stripping the
        # last dotted component recovers the owning sentence id.
        pairElements = sentenceElement.findall("pair")
        if pairElements is not None:
            self.pairs = pairElements
            if removeIntersentenceInteractions:
                self.pairs = [pair for pair in pairElements
                              if pair.get("e1").rsplit(".", 1)[0] == sentenceId
                              and pair.get("e2").rsplit(".", 1)[0] == sentenceId]

        # Interactions: the id is assumed to have the form
        # corpus.document.sentence.entity, so keeping the first
        # (count(".") - 2) components from the right yields the sentence id.
        interactionElements = sentenceElement.findall("interaction")
        if interactionElements is not None:
            self.interactions = interactionElements
            self.interSentenceInteractions = []
            if removeIntersentenceInteractions:
                interactionsToKeep = []
                for interaction in interactionElements:
                    e1rsplits = interaction.get("e1").count(".") - 2
                    e2rsplits = interaction.get("e2").count(".") - 2
                    if interaction.get("e1").rsplit(".", e1rsplits)[0] == sentenceId and \
                       interaction.get("e2").rsplit(".", e2rsplits)[0] == sentenceId:
                        interactionsToKeep.append(interaction)
                    else:
                        self.interSentenceInteractions.append(interaction)
                self.interactions = interactionsToKeep

        # Entities: drop negative ("neg") placeholder entities and index
        # the rest by id.
        entityElements = sentenceElement.findall("entity")
        if entityElements is not None:
            entityElements = [e for e in entityElements if e.get("type") != "neg"]
            self.entities = entityElements
            for entityElement in entityElements:
                if removeNameInfo:
                    entityElement.set("isName", "False")
                self.entitiesById[entityElement.attrib["id"]] = entityElement

        # Locate the analyses container; old corpora use <sentenceanalyses>,
        # newer ones <analyses> — never both.
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        analysesElement = sentenceElement.find("analyses")
        assert sentenceAnalysesElement is None or analysesElement is None, sentenceId
        if sentenceAnalysesElement is None:
            sentenceAnalysesElement = analysesElement
        if sentenceAnalysesElement is not None:
            if parse is not None:
                parseElements = [x for x in sentenceAnalysesElement.iter("parse")]
                if len(parseElements) > 0:  # new format
                    self.parseElement = None
                    for element in parseElements:
                        if element.get("parser") == parse:
                            self.parseElement = element
                            break
                    if self.parseElement is not None:
                        # The parse names the tokenization it was built on.
                        tokenization = self.parseElement.get("tokenizer")
                    for element in sentenceAnalysesElement.iter("tokenization"):
                        if element.get("tokenizer") == tokenization:
                            self.tokenizationElement = element
                            break
                else:  # old format
                    # BUGFIX: parsesElement was previously always None here
                    # (its assignment had been commented out), so this branch
                    # crashed with AttributeError. Restore the lookup and
                    # guard against a missing <parses> container.
                    parsesElement = sentenceAnalysesElement.find("parses")
                    if parsesElement is not None:
                        self.parseElement = parsesElement.find(parse)
                    if tokenization is not None:
                        tokenizationsElement = sentenceAnalysesElement.find("tokenizations")
                        if tokenizationsElement is not None:
                            self.tokenizationElement = tokenizationsElement.find(tokenization)

        # Pull dependencies/tokens out of whatever parse/tokenization was
        # located above.
        dependencyElements = None
        if self.parseElement is not None:
            dependencyElements = self.parseElement.findall("dependency")
            if dependencyElements is not None:
                self.dependencies = dependencyElements
        elif verbose:
            sys.stderr.write("Warning, parse %s not found\n" % str(parse))
        if self.tokenizationElement is not None:
            tokenElements = self.tokenizationElement.findall("token")
            if tokenElements is not None:
                self.tokens = tokenElements
        elif verbose:
            sys.stderr.write("Warning, tokenization %s not found\n" % str(tokenization))

    def getEntity(self, offset, offsetList, entityIds):
        """Return the id paired with the first entry of *offsetList* that
        overlaps *offset* ([begin, end], inclusive), or None if nothing
        overlaps. offsetList and entityIds are parallel lists.
        """
        for candidate, entityId in zip(offsetList, entityIds):
            if (offset[0] >= candidate[0] and offset[0] <= candidate[1]) or \
               (candidate[0] >= offset[0] and candidate[0] <= offset[1]):
                return entityId
            # no overlap; keep scanning
        return None

    def getEntityTokens(self):
        """Map each entity id to the ids of the tokens its character
        offsets overlap.

        Returns a dict {entityId: [tokenId, ...]}; tokens covered by no
        entity are collected under the key None. Every entity id appears
        as a key even when no token overlaps it.
        """
        entityTokens = {}
        entityOffsets = []
        entityOffsetIds = []
        for entityElement in self.entities:
            entityId = entityElement.get("id")
            if entityId not in entityTokens:
                entityTokens[entityId] = []
            # charOffset may list several comma-separated "begin-end" spans.
            for span in entityElement.get("charOffset").split(","):
                offset = span.split("-")
                offset[0] = int(offset[0])
                offset[1] = int(offset[1])
                entityOffsets.append(offset)
                entityOffsetIds.append(entityId)

        for tokenElement in self.tokens:
            offset = tokenElement.get("charOffset").split("-")
            offset[0] = int(offset[0])
            offset[1] = int(offset[1])
            tokenId = tokenElement.get("id")
            entityId = self.getEntity(offset, entityOffsets, entityOffsetIds)
            if entityId not in entityTokens:
                entityTokens[entityId] = []
            entityTokens[entityId].append(tokenId)

        return entityTokens
167