
Source Code for Module TEES.ExampleBuilders.FeatureBuilders.GiulianoFeatureBuilder

  1  """ 
  2  Giuliano Feature Builder 
  3  """ 
  4  __version__ = "$Revision: 1.1 $" 
  5   
  6  import sys,os 
  7  from FeatureBuilder import FeatureBuilder 
  8  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  9  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
 10  import Utils.Range as Range 
 11   
class GiulianoFeatureBuilder(FeatureBuilder):
    def __init__(self, featureSet):
        """
        This is called when the ExampleBuilder object is created.

        @type featureSet: Core.IdSet
        @param featureSet: The feature ids
        """
        FeatureBuilder.__init__(self, featureSet)

    def initSentence(self, sentenceGraph):
        """
        This function is called once for each sentence, before any calls to "buildFeatures".
        It should be used to initialize per-sentence data structures.

        @type sentenceGraph: Core.SentenceGraph
        @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and
                              syntactic information of the sentence. The underlying XML can also
                              be accessed through this class.
        """
        ### Sentence initialization code here ###
        pass

    def buildEdgeFeatures(self, entity1, entity2, token1, token2, path, sentenceGraph):
        """
        This is the main function for feature generation. It is called once for each
        directed entity pair in the sentence.

        For defining features, please use the member function "setFeature(self, name, value=1)",
        derived from the parent class. This ensures features get correctly tagged, if needed.

        @type entity1: cElementTree.Element
        @param entity1: First entity of the candidate edge, an Interaction XML "entity"-element
        @type entity2: cElementTree.Element
        @param entity2: Second entity of the candidate edge, an Interaction XML "entity"-element
        @type token1: cElementTree.Element
        @param token1: The head token of entity1, an Interaction XML "token"-element
        @type token2: cElementTree.Element
        @param token2: The head token of entity2, an Interaction XML "token"-element
        @type path: list of cElementTree.Elements (when the "no_path" style is set, this is always [token1, token2])
        @param path: the shortest connecting path of tokens (Interaction XML "token"-elements)
        @type sentenceGraph: Core.SentenceGraph
        @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and
                              syntactic information of the sentence. The underlying XML can also
                              be accessed through this class.
        """
        ### Feature generation code here ###
        self.sentenceGraph = sentenceGraph
        patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(entity1, entity2)
        for feature in patternForeBetween:
            self.setFeature("pFB_" + feature, patternForeBetween[feature])
        for feature in patternBetween:
            self.setFeature("pB_" + feature, patternBetween[feature])
        for feature in patternBetweenAfter:
            self.setFeature("pBA_" + feature, patternBetweenAfter[feature])

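    # Example (hypothetical sentence, for illustration only): for
    # "ProtA binds to ProtB in vitro" with entity1 = ProtA and entity2 = ProtB,
    # the loops above would emit sparse features such as
    #   "pFB_binds", "pFB_to", "pFB_binds_to"        (Fore-Between window)
    #   "pB_binds",  "pB_to",  "pB_binds_to"         (Between window)
    #   "pBA_in", "pBA_vitro", "pBA_in_vitro", ...   (Between-After window)
    # each with the corresponding n-gram count from getPatterns as its value.
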
    def buildTriggerFeatures(self, token, sentenceGraph):
        # Builds the same context-pattern features for a single token, which acts
        # as both "entities" of the pattern extraction.
        ### Feature generation code here ###
        self.sentenceGraph = sentenceGraph
        patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(token, token)
        for feature in patternForeBetween:
            self.setFeature("pFB_" + feature, patternForeBetween[feature])
        for feature in patternBetween:
            self.setFeature("pB_" + feature, patternBetween[feature])
        for feature in patternBetweenAfter:
            self.setFeature("pBA_" + feature, patternBetweenAfter[feature])

    def getGlobalContextKernel(self, patterns1, patterns2):
        # Sum of the cosine kernels over the three context windows
        kernelFB = self.calculateKernel(patterns1["Fore-Between"], patterns2["Fore-Between"])
        kernelB = self.calculateKernel(patterns1["Between"], patterns2["Between"])
        kernelBA = self.calculateKernel(patterns1["Between-After"], patterns2["Between-After"])
        return kernelFB + kernelB + kernelBA

    def getRelativePosition(self, entity1Range, entity2Range, token):
        # Determine whether the token overlaps one of the entities or lies
        # before ("Fore"), between ("Between") or after ("After") the entity pair.
        offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if Range.overlap(entity1Range, offset):
            return "Entity1"
        if Range.overlap(entity2Range, offset):
            return "Entity2"
        entitiesRange = (min(entity1Range[0], entity2Range[0]), max(entity1Range[1], entity2Range[1]))
        if offset[1] < entitiesRange[0]:
            return "Fore"
        elif offset[1] > entitiesRange[1]:
            return "After"
        else:
            return "Between"

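    # Example (hypothetical character offsets, for illustration only): with
    # entity1Range = (0, 4) and entity2Range = (15, 19),
    #   a token spanning (6, 10)  falls between the entities -> "Between"
    #   a token spanning (21, 28) follows both entities      -> "After"
    #   a token overlapping (0, 4) is the first entity       -> "Entity1"
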
    def getPatterns(self, e1, e2):
        e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
        e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))

        # Classify each token of the sentence relative to the entity pair
        tokenPositions = {}
        for token in self.sentenceGraph.tokens:
            tokenPositions[token.get("id")] = self.getRelativePosition(e1Range, e2Range, token)

        # Collect token and n-gram counts for the Fore-Between, Between and
        # Between-After context windows
        prevTokenText = None
        prevToken2Text = None
        prevPosition = None
        patternForeBetween = {}
        patternBetween = {}
        patternBetweenAfter = {}
        for token in self.sentenceGraph.tokens:
            if self.sentenceGraph.tokenIsName[token]:
                continue

            id = token.get("id")
            text = token.get("text").lower()

            if prevPosition != tokenPositions[id]:
                prevTokenText = None
                prevToken2Text = None

            if tokenPositions[id] == "Fore":
                self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
            elif tokenPositions[id] == "Between":
                self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
                self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
                self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
            elif tokenPositions[id] == "After":
                self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)

            prevPosition = tokenPositions[id]
            #if tokenPositions[id].find("Entity") != -1:
            prevToken2Text = prevTokenText
            prevTokenText = text

        return patternForeBetween, patternBetween, patternBetweenAfter

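    # Example (hypothetical sentence, for illustration only): for
    # "ProtA binds to ProtB in vitro" (ProtA = e1, ProtB = e2, both marked as
    # names and therefore skipped), the three returned dictionaries would be
    #   patternForeBetween  = {"binds": 1, "to": 1, "binds_to": 1}
    #   patternBetween      = {"binds": 1, "to": 1, "binds_to": 1}
    #   patternBetweenAfter = {"binds": 1, "to": 1, "binds_to": 1,
    #                          "in": 1, "vitro": 1, "in_vitro": 1}
    # Note that the n-gram context is reset whenever the window changes, so no
    # bigram is formed across the "Between"/"After" boundary.
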
    def addToPattern(self, pattern, tokenText, prevTokenText, prevToken2Text):
        if not pattern.has_key(tokenText):
            pattern[tokenText] = 0
        pattern[tokenText] += 1

        # Should the n-grams be unordered?
        if prevTokenText != None:
            ngram1 = prevTokenText + "_" + tokenText
            if not pattern.has_key(ngram1):
                pattern[ngram1] = 0
            pattern[ngram1] += 1

            if prevToken2Text != None:
                ngram2 = prevToken2Text + "_" + ngram1
                if not pattern.has_key(ngram2):
                    pattern[ngram2] = 0
                pattern[ngram2] += 1

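    # Example (hypothetical arguments, for illustration only): calling this with
    # tokenText="to", prevTokenText="binds", prevToken2Text="protein" on an empty
    # dictionary increments the unigram, bigram and trigram counts, leaving
    #   pattern == {"to": 1, "binds_to": 1, "protein_binds_to": 1}
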
    def calculateKernel(self, pattern1, pattern2):
        dotProduct = 0.0
        length1 = 0.0
        length2 = 0.0
        # The dotProduct is the numerator
        for k, v in pattern1.iteritems():
            if pattern2.has_key(k):
                dotProduct += v * pattern2[k]
        # Get the length of the first vector
        for v in pattern1.values():
            length1 += v * v
        length1 = math.sqrt(length1)
        # Get the length of the second vector
        for v in pattern2.values():
            length2 += v * v
        length2 = math.sqrt(length2)

        if length1 == 0 or length2 == 0:
            return 0.0
        else:
            return dotProduct / (length1 * length2)
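
    # Example (hypothetical count vectors, for illustration only): for
    # pattern1 = {"binds": 2, "to": 1} and pattern2 = {"binds": 1, "with": 1}
    #   dotProduct = 2*1 = 2.0
    #   length1    = sqrt(2*2 + 1*1) = sqrt(5) ~ 2.236
    #   length2    = sqrt(1*1 + 1*1) = sqrt(2) ~ 1.414
    #   kernel     = 2.0 / (2.236 * 1.414) ~ 0.63  (cosine similarity of the vectors)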

if __name__ == "__main__":
    """
    The main function is a test program for the EVEX feature builder. It takes as a parameter an
    Interaction XML corpus file, and builds edge examples using MultiEdgeExampleBuilder. When the
    "evex" style parameter is set, MultiEdgeExampleBuilder will call EVEXFeatureBuilder for feature
    generation.
    """
    import sys
    sys.path.append("../..")
    from Core.IdSet import IdSet
    import Core.ExampleUtils as ExampleUtils
    from ExampleBuilders.MultiEdgeExampleBuilder import MultiEdgeExampleBuilder

    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nTest EVEX Feature Builder.")
    defaultInput = "/usr/share/biotext/BioNLP2011/data/main-tasks/GE/GE-devel-nodup.xml"
    optparser.add_option("-i", "--input", default=defaultInput, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default="evex-examples.txt", dest="output", help="Output feature file")
    optparser.add_option("-d", "--edgeIds", default="evex-ids", dest="edgeIds", help="Example class and feature id file stem (files = STEM.class_names and STEM.feature_names)")
    optparser.add_option("-t", "--tokenization", default="split-mccc-preparsed", dest="tokenization", help="tokenization")
    optparser.add_option("-p", "--parse", default="split-mccc-preparsed", dest="parse", help="parse")
    optparser.add_option("-s", "--styles", default="typed,directed,no_path,no_task,no_dependency,no_linear,entities,genia_limits,noMasking,maxFeatures,evex", dest="edgeStyles", help="")
    (options, args) = optparser.parse_args()
    assert options.input != None
    assert options.output != None
    assert options.edgeIds != None

    exampleBuilder = MultiEdgeExampleBuilder()
    exampleBuilder.run(options.input, options.output, options.parse, options.tokenization, "style:" + options.edgeStyles, options.edgeIds)
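
# Example invocation (hypothetical file names, for illustration only):
#   python GiulianoFeatureBuilder.py -i GE-devel-nodup.xml -o edge-examples.txt -d edge-ids
# This builds edge examples from the given Interaction XML corpus, writes the feature
# vectors to the output file, and stores the class and feature ids in the files
# STEM.class_names and STEM.feature_names.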