1 import sys
2
4 import cElementTreeUtils as ETUtils
5 from InteractionXML.SentenceElements import SentenceElements
6
7
# NOTE(review): this span is the body of a generator function whose `def`
# header is not visible in this chunk (the embedded numbering skips a line
# before it), and every line still carries a stray line-number prefix from a
# mangled extract — code below is kept byte-identical.
# Purpose (from the visible code): stream an interaction-XML corpus without
# loading it whole; when each <document> ends, wrap its sentences as
# SentenceElements, yield them, write the document back out, then clear it
# to bound memory. `output`, `input`, `parse`, `tokenization` are free names
# here — presumably parameters of the missing `def`; confirm against caller.
8 etWriter = ETUtils.ETWriter(output)
9 for eTuple in ETUtils.ETIteratorFromObj(input, ("start", "end")):
10 element = eTuple[1]
# A document's subtree is complete only at its "end" event.
11 if eTuple[0] == "end" and element.tag == "document":
12 sentences = []
13 for sentenceElement in element.findall("sentence"):
14
# Inter-sentence interactions are deliberately kept here
# (removeIntersentenceInteractions=False).
15 sentence = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions=False)
16 sentences.append(sentence)
17 yield sentences
18 etWriter.write(element)
# The <corpus> root is opened/closed around the streamed documents.
19 elif element.tag == "corpus":
20 if eTuple[0] == "start":
21 etWriter.begin(element)
22 else:
23 etWriter.end(element)
# Free memory for elements that have been fully processed.
24 if eTuple[0] == "end" and element.tag in ["document", "corpus"]:
25 element.clear()
26 etWriter.close()
27
def __init__(self, sentenceElement, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False, verbose=False):
    """
    Collect the annotation of a single interaction-XML <sentence> element.

    sentenceElement -- the <sentence> ElementTree element
    parse -- name of the parse to load ("parser" attribute in the new format,
             child tag of <parses> in the old format)
    tokenization -- name of the tokenization to load; in the new format it is
             overridden by the chosen parse's "tokenizer" attribute
    removeIntersentenceInteractions -- drop pairs/interactions whose e1 or e2
             refers to an entity of another sentence
    removeNameInfo -- force every entity's "isName" attribute to "False"
    verbose -- warn on stderr when the requested parse/tokenization is absent
    """
    self.sentence = sentenceElement
    self.entities = []
    self.entitiesById = {}
    self.pairs = []
    self.interactions = []
    self.tokens = []
    self.dependencies = []
    # Located below; stay None when the requested analyses are not present.
    self.parseElement = None
    self.tokenizationElement = None

    sentenceId = sentenceElement.get("id")

    # Pairs: entity ids look like "<sentenceId>.<entityIndex>", so a single
    # rsplit recovers the owning sentence id.
    pairElements = sentenceElement.findall("pair")
    if pairElements != None:
        self.pairs = pairElements
        if removeIntersentenceInteractions:
            pairsToKeep = []
            for pair in pairElements:
                if pair.get("e1").rsplit(".", 1)[0] == sentenceId and pair.get("e2").rsplit(".", 1)[0] == sentenceId:
                    pairsToKeep.append(pair)
            self.pairs = pairsToKeep

    # Interactions: entity ids may have extra dot-separated levels; keep the
    # last two components as the entity part and compare the remainder with
    # the sentence id. Cross-sentence interactions are retained separately.
    interactionElements = sentenceElement.findall("interaction")
    if interactionElements != None:
        self.interactions = interactionElements
        self.interSentenceInteractions = []
        if removeIntersentenceInteractions:
            interactionsToKeep = []
            for interaction in interactionElements:
                e1rsplits = interaction.get("e1").count(".") - 2
                e2rsplits = interaction.get("e2").count(".") - 2
                if interaction.get("e1").rsplit(".", e1rsplits)[0] == sentenceId and interaction.get("e2").rsplit(".", e2rsplits)[0] == sentenceId:
                    interactionsToKeep.append(interaction)
                else:
                    self.interSentenceInteractions.append(interaction)
            self.interactions = interactionsToKeep

    # Entities: entities typed "neg" are always discarded.
    entityElements = sentenceElement.findall("entity")
    if entityElements != None:
        entitiesToKeep = []
        for entityElement in entityElements:
            if entityElement.get("type") != "neg":
                entitiesToKeep.append(entityElement)
        entityElements = entitiesToKeep
        self.entities = entityElements
        for entityElement in entityElements:
            if removeNameInfo:
                entityElement.set("isName", "False")
            self.entitiesById[entityElement.attrib["id"]] = entityElement

    # Analyses container: old files use <sentenceanalyses>, newer ones
    # <analyses>; a sentence must not carry both.
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    analysesElement = sentenceElement.find("analyses")
    assert sentenceAnalysesElement == None or analysesElement == None, sentenceId
    if sentenceAnalysesElement == None:
        sentenceAnalysesElement = analysesElement
    if sentenceAnalysesElement != None:
        if parse != None:
            # New format: <parse parser="..."> elements directly below the
            # analyses container.
            parseElements = [x for x in sentenceAnalysesElement.getiterator("parse")]
            if len(parseElements) > 0:
                self.parseElement = None
                for element in parseElements:
                    if element.get("parser") == parse:
                        self.parseElement = element
                        break
                if self.parseElement != None:
                    # The parse records which tokenization it was built on.
                    tokenization = self.parseElement.get("tokenizer")
                tokenizationElements = [x for x in sentenceAnalysesElement.getiterator("tokenization")]
                for element in tokenizationElements:
                    if element.get("tokenizer") == tokenization:
                        self.tokenizationElement = element
                        break
            else:
                # Old format: <parses>/<tokenizations> wrapper elements whose
                # children are tagged with the parse/tokenization name.
                # BUGFIX: `parsesElement` was initialized to None and never
                # assigned, so this branch crashed with an AttributeError on
                # `parsesElement.find(parse)`; locate the wrapper the same
                # way the <tokenizations> wrapper is located below.
                parsesElement = sentenceAnalysesElement.find("parses")
                if parsesElement != None:
                    self.parseElement = parsesElement.find(parse)
                if tokenization != None:
                    tokenizationsElement = sentenceAnalysesElement.find("tokenizations")
                    if tokenizationsElement != None:
                        self.tokenizationElement = tokenizationsElement.find(tokenization)

        dependencyElements = None
        if self.parseElement != None:
            dependencyElements = self.parseElement.findall("dependency")
            if dependencyElements != None:
                self.dependencies = dependencyElements
        else:
            if verbose:
                # sys.stderr.write instead of the py2-only `print >>` chevron:
                # identical output, valid under both Python 2 and 3.
                sys.stderr.write("Warning, parse %s not found\n" % parse)
        if self.tokenizationElement != None:
            tokenElements = self.tokenizationElement.findall("token")
            if tokenElements != None:
                self.tokens = tokenElements
        else:
            if verbose:
                sys.stderr.write("Warning, tokenization %s not found\n" % tokenization)
130
def getEntity(self, offset, offsetList, entityIds):
    """Return the id of the first entity whose character span overlaps *offset*.

    offset -- (start, end) pair of a token, inclusive on both ends
    offsetList -- list of (start, end) entity spans, parallel to entityIds
    entityIds -- the entity id owning each span in offsetList
    Returns None when no span overlaps the token.
    """
    for position, span in enumerate(offsetList):
        # Two closed intervals overlap exactly when either one starts
        # inside the other.
        tokenStartsInsideSpan = span[0] <= offset[0] <= span[1]
        spanStartsInsideToken = offset[0] <= span[0] <= offset[1]
        if tokenStartsInsideSpan or spanStartsInsideToken:
            return entityIds[position]
    return None
139
# NOTE(review): the `def` header for this method is missing from this chunk
# (the embedded numbering jumps from 139 to 141), and each line carries a
# stray line-number prefix from a mangled extract — code kept byte-identical.
# Purpose (from the visible code): build a dict mapping entity id -> list of
# token ids whose character spans overlap that entity. Tokens matching no
# entity are collected under the key None; entities with no tokens keep an
# empty list. Only the FIRST overlapping entity is recorded per token, since
# self.getEntity returns on the first match.
141 entityElements = self.entities
# entityOffsets[i] and entityOffsetIds[i] are parallel: one (start, end)
# span plus the id of the entity that owns it (an entity's "charOffset"
# attribute can hold several comma-separated spans).
142 entityOffsets = []
143 entityOffsetIds = []
144 entityTokens = {}
145 for entityElement in entityElements:
# has_key is Python 2 only — this module predates Python 3.
146 if not entityTokens.has_key(entityElement.get("id")):
147 entityTokens[entityElement.get("id")] = []
148 offsets = entityElement.get("charOffset").split(",")
149 for i in offsets:
# "start-end" -> [int(start), int(end)]
150 offset = i.split("-")
151 offset[0] = int(offset[0])
152 offset[1] = int(offset[1])
153 entityOffsets.append(offset)
154 entityOffsetIds.append(entityElement.get("id"))
155
# Assign each token to the first entity whose span overlaps it.
156 for tokenElement in self.tokens:
157 offset = tokenElement.get("charOffset").split("-")
158 offset[0] = int(offset[0])
159 offset[1] = int(offset[1])
160 id = tokenElement.get("id")
# entityId is None for tokens outside every entity span.
161 entityId = self.getEntity(offset, entityOffsets, entityOffsetIds)
162 if not entityTokens.has_key(entityId):
163 entityTokens[entityId] = []
164 entityTokens[entityId].append(id)
165
166 return entityTokens
167