1 """
2 Giuliano Feature Builder
3 """
4 __version__ = "$Revision: 1.1 $"
5
import sys, os
import math
from FeatureBuilder import FeatureBuilder
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
import Utils.Range as Range
11
14 """
15 This is called, when the ExampleBuilder object is created.
16
17 @type featureSet: Core.IdSet
18 @param featureSet: The feature ids
19 """
20 FeatureBuilder.__init__(self, featureSet)
21
23 """
24 This function is called once for each sentence, before any calls to "buildFeatures". It
25 should be used to initialize per-sentence data structures.
26
27 @type sentenceGraph: Core.SentenceGraph
28 @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and syntactic
29 information of the sentence. The underlying XML can also be accessed through
30 this class.
31 """
32
33 pass
34
36 """
37 This is the main-function for feature generation. It is called once for each
38 directed entity pair in the sentence.
39
40 For defining features, please use the member function "setFeature(self, name, value=1)",
41 derived from the parent class. This ensures features get correctly tagged, if needed.
42
43 @type entity1: cElementTree.Element
44 @param entity1: First entity of the candidate edge, an Interaction XML "entity"-element
45 @type entity2: cElementTree.Element
46 @param entity2: Second entity of the candidate edge, an Interaction XML "entity"-element
47 @type token1: cElementTree.Element
48 @param token1: The head token of entity1, an Interaction XML "token"-element
49 @type token2: cElementTree.Element
50 @param token2: The head token of entity2, an Interaction XML "token"-element
51 @type path: list of cElementTree.Elements (when "no_path" style is set, this is always [token1, token2])
52 @param path: the shortest connecting path of tokens (Interaction XML "token"-elements)
53 @type sentenceGraph: Core.SentenceGraph
54 @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and syntactic
55 information of the sentence. The underlying XML can also be accessed through
56 this class.
57 """
58
59 self.sentenceGraph = sentenceGraph
60 patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(entity1, entity2)
61 for feature in patternForeBetween:
62 self.setFeature("pFB_" + feature, patternForeBetween[feature])
63 for feature in patternBetween:
64 self.setFeature("pB_" + feature, patternBetween[feature])
65 for feature in patternBetweenAfter:
66 self.setFeature("pBA_" + feature, patternBetweenAfter[feature])
67
        # NOTE(review): the enclosing "def" header is missing from this chunk;
        # judging by the body this is a token-level (single-endpoint) variant of
        # buildFeatures, taking (self, token, sentenceGraph) -- confirm the
        # method name against the full file.
        self.sentenceGraph = sentenceGraph
        # Same token is used as both pattern endpoints.
        patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(token, token)
        # Region-prefixed n-gram count features:
        # pFB_ = Fore-Between, pB_ = Between, pBA_ = Between-After.
        for feature in patternForeBetween:
            self.setFeature("pFB_" + feature, patternForeBetween[feature])
        for feature in patternBetween:
            self.setFeature("pB_" + feature, patternBetween[feature])
        for feature in patternBetweenAfter:
            self.setFeature("pBA_" + feature, patternBetweenAfter[feature])
78
79 - def getGlobalContextKernel(self, patterns1, patterns2):
80 kernelFB = calculateKernel(patterns1["Fore-Between"], patterns2["Fore-Between"])
81 kernelB = calculateKernel(patterns1["Between"], patterns2["Between"])
82 kernelBA = calculateKernel(patterns1["Between-After"], patterns2["Between-After"])
83 return kernelFB + kernelB + kernelBA
84
86 offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
87 if Range.overlap(entity1Range, offset):
88 return "Entity1"
89 if Range.overlap(entity2Range, offset):
90 return "Entity2"
91 entitiesRange = (min(entity1Range[0],entity2Range[0]),max(entity1Range[1],entity2Range[1]))
92 if offset[1] < entitiesRange[0]:
93 return "Fore"
94 elif offset[1] > entitiesRange[1]:
95 return "After"
96 else:
97 return "Between"
98
100 e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
101 e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
102
103 tokenPositions = {}
104 for token in self.sentenceGraph.tokens:
105 tokenPositions[token.get("id")] = self.getRelativePosition(e1Range,e2Range,token)
106
107 prevTokenText = None
108 prevToken2Text = None
109 prevPosition = None
110 patternForeBetween = {}
111 patternBetween = {}
112 patternBetweenAfter = {}
113 for token in self.sentenceGraph.tokens:
114 if self.sentenceGraph.tokenIsName[token]:
115 continue
116
117 id = token.get("id")
118 text = token.get("text").lower()
119
120 if prevPosition != tokenPositions[id]:
121 prevTokenText = None
122 prevToken2Text = None
123
124 if tokenPositions[id] == "Fore":
125 self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
126 elif tokenPositions[id] == "Between":
127 self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
128 self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
129 self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
130 elif tokenPositions[id] == "After":
131 self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
132
133 prevPosition = tokenPositions[id]
134
135 prevToken2Text = prevTokenText
136 prevTokenText = text
137
138 return patternForeBetween, patternBetween, patternBetweenAfter
139
140 - def addToPattern(self, pattern, tokenText, prevTokenText, prevToken2Text):
141 if not pattern.has_key(tokenText):
142 pattern[tokenText] = 0
143 pattern[tokenText] += 1
144
145
146 if prevTokenText != None:
147 ngram1 = prevTokenText + "_" + tokenText
148 if not pattern.has_key(ngram1):
149 pattern[ngram1] = 0
150 pattern[ngram1] += 1
151
152 if prevToken2Text != None:
153 ngram2 = prevToken2Text + "_" + ngram1
154 if not pattern.has_key(ngram2):
155 pattern[ngram2] = 0
156 pattern[ngram2] += 1
157
def calculateKernel(pattern1, pattern2):
    """
    Normalized dot-product (cosine) kernel between two sparse count vectors.

    NOTE(review): the original "def" header is missing from this chunk; the
    body uses only pattern1/pattern2 and the function is called without self
    in getGlobalContextKernel, so it is reconstructed as a module-level
    function -- confirm against the full file. Requires "import math", which
    was missing from the original module imports.

    @param pattern1: {feature: count} dictionary
    @param pattern2: {feature: count} dictionary
    @return: cosine similarity; 0.0 when either vector is empty/zero
    """
    # Dot product over the keys the two patterns share
    # (items()/"in" replace the Python-2-only iteritems()/has_key()).
    dotProduct = 0.0
    for key, value in pattern1.items():
        if key in pattern2:
            dotProduct += value * pattern2[key]

    # Euclidean norms of both count vectors.
    length1 = math.sqrt(sum(v * v for v in pattern1.values()))
    length2 = math.sqrt(sum(v * v for v in pattern2.values()))

    # Guard against division by zero for empty/zero vectors.
    if length1 == 0 or length2 == 0:
        return 0.0
    return dotProduct / (length1 * length2)
179
180 if __name__=="__main__":
181 """
182 The main-function is the test program for the EVEX feature builder. It takes as a parameter an
183 Interaction XML corpus file, and builds edge-examples using MultiEdgeExampleBuilder. When the
184 "evex" style parameter is set, MultiEdgeExampleBuilder will call EVEXFeatureBuilder for feature
185 generation.
186 """
187 import sys
188 sys.path.append("../..")
189 from Core.IdSet import IdSet
190 import Core.ExampleUtils as ExampleUtils
191 from ExampleBuilders.MultiEdgeExampleBuilder import MultiEdgeExampleBuilder
192
193
194 try:
195 import psyco
196 psyco.full()
197 print >> sys.stderr, "Found Psyco, using"
198 except ImportError:
199 print >> sys.stderr, "Psyco not installed"
200
201 from optparse import OptionParser
202 optparser = OptionParser(usage="%prog [options]\nTest EVEX Feature Builder.")
203 defaultInput = "/usr/share/biotext/BioNLP2011/data/main-tasks/GE/GE-devel-nodup.xml"
204 optparser.add_option("-i", "--input", default=defaultInput, dest="input", help="Corpus in analysis format", metavar="FILE")
205 optparser.add_option("-o", "--output", default="evex-examples.txt", dest="output", help="Output feature file")
206 optparser.add_option("-d", "--edgeIds", default="evex-ids", dest="edgeIds", help="Example class and feature id file stem (files = STEM.class_names and STEM.feature_names)")
207 optparser.add_option("-t", "--tokenization", default="split-mccc-preparsed", dest="tokenization", help="tokenization")
208 optparser.add_option("-p", "--parse", default="split-mccc-preparsed", dest="parse", help="parse")
209 optparser.add_option("-s", "--styles", default="typed,directed,no_path,no_task,no_dependency,no_linear,entities,genia_limits,noMasking,maxFeatures,evex", dest="edgeStyles", help="")
210 (options, args) = optparser.parse_args()
211 assert options.input != None
212 assert options.output != None
213 assert options.edgeIds != None
214
215 exampleBuilder = MultiEdgeExampleBuilder()
216 exampleBuilder.run(options.input, options.output, options.parse, options.tokenization, "style:"+options.edgeStyles, options.edgeIds)
217