1  """ 
  2  Giuliano Feature Builder 
  3  """ 
  4  __version__ = "$Revision: 1.1 $" 
  5   
  6  import sys,os 
  7  from FeatureBuilder import FeatureBuilder 
  8  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  9  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
 10  import Utils.Range as Range 
 11   
 14          """ 
 15          This is called, when the ExampleBuilder object is created. 
 16           
 17          @type featureSet: Core.IdSet 
 18          @param featureSet: The feature ids 
 19          """ 
 20          FeatureBuilder.__init__(self, featureSet) 
  21       
 23          """ 
 24          This function is called once for each sentence, before any calls to "buildFeatures". It 
 25          should be used to initialize per-sentence data structures. 
 26           
 27          @type sentenceGraph: Core.SentenceGraph 
 28          @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and syntactic 
 29                         information of the sentence. The underlying XML can also be accessed through 
 30                         this class. 
 31          """ 
 32           
 33          pass 
  34       
 36          """ 
 37          This is the main-function for feature generation. It is called once for each  
 38          directed entity pair in the sentence. 
 39           
 40          For defining features, please use the member function "setFeature(self, name, value=1)", 
 41          derived from the parent class. This ensures features get correctly tagged, if needed. 
 42           
 43          @type entity1: cElementTree.Element 
 44          @param entity1: First entity of the candidate edge, an Interaction XML "entity"-element 
 45          @type entity2: cElementTree.Element 
 46          @param entity2: Second entity of the candidate edge, an Interaction XML "entity"-element 
 47          @type token1: cElementTree.Element 
 48          @param token1: The head token of entity1, an Interaction XML "token"-element 
 49          @type token2: cElementTree.Element 
 50          @param token2: The head token of entity2, an Interaction XML "token"-element 
 51          @type path: list of cElementTree.Elements (when "no_path" style is set, this is always [token1, token2]) 
 52          @param path: the shortest connecting path of tokens (Interaction XML "token"-elements) 
 53          @type sentenceGraph: Core.SentenceGraph 
 54          @param sentenceGraph: a SentenceGraph object providing access to the aligned semantic and syntactic 
 55                         information of the sentence. The underlying XML can also be accessed through 
 56                         this class. 
 57          """ 
 58           
 59          self.sentenceGraph = sentenceGraph 
 60          patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(entity1, entity2) 
 61          for feature in patternForeBetween: 
 62              self.setFeature("pFB_" + feature, patternForeBetween[feature]) 
 63          for feature in patternBetween: 
 64              self.setFeature("pB_" + feature, patternBetween[feature]) 
 65          for feature in patternBetweenAfter: 
 66              self.setFeature("pBA_" + feature, patternBetweenAfter[feature]) 
  67       
 69           
 70          self.sentenceGraph = sentenceGraph 
 71          patternForeBetween, patternBetween, patternBetweenAfter = self.getPatterns(token, token) 
 72          for feature in patternForeBetween: 
 73              self.setFeature("pFB_" + feature, patternForeBetween[feature]) 
 74          for feature in patternBetween: 
 75              self.setFeature("pB_" + feature, patternBetween[feature]) 
 76          for feature in patternBetweenAfter: 
 77              self.setFeature("pBA_" + feature, patternBetweenAfter[feature]) 
  78           
 79 -    def getGlobalContextKernel(self, patterns1, patterns2): 
  80          kernelFB = calculateKernel(patterns1["Fore-Between"], patterns2["Fore-Between"]) 
 81          kernelB = calculateKernel(patterns1["Between"], patterns2["Between"]) 
 82          kernelBA = calculateKernel(patterns1["Between-After"], patterns2["Between-After"]) 
 83          return kernelFB + kernelB + kernelBA 
  84   
 86          offset = Range.charOffsetToSingleTuple(token.get("charOffset")) 
 87          if Range.overlap(entity1Range, offset): 
 88              return "Entity1" 
 89          if Range.overlap(entity2Range, offset): 
 90              return "Entity2" 
 91          entitiesRange = (min(entity1Range[0],entity2Range[0]),max(entity1Range[1],entity2Range[1])) 
 92          if offset[1] < entitiesRange[0]: 
 93              return "Fore" 
 94          elif offset[1] > entitiesRange[1]: 
 95              return "After" 
 96          else: 
 97              return "Between" 
  98       
100          e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset")) 
101          e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset")) 
102           
103          tokenPositions = {} 
104          for token in self.sentenceGraph.tokens: 
105              tokenPositions[token.get("id")] = self.getRelativePosition(e1Range,e2Range,token) 
106           
107          prevTokenText = None 
108          prevToken2Text = None 
109          prevPosition = None 
110          patternForeBetween = {} 
111          patternBetween = {} 
112          patternBetweenAfter = {} 
113          for token in self.sentenceGraph.tokens: 
114              if self.sentenceGraph.tokenIsName[token]: 
115                  continue 
116                   
117              id = token.get("id") 
118              text = token.get("text").lower() 
119               
120              if prevPosition != tokenPositions[id]: 
121                  prevTokenText = None 
122                  prevToken2Text = None 
123               
124              if tokenPositions[id] == "Fore": 
125                  self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text) 
126              elif tokenPositions[id] == "Between": 
127                  self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text) 
128                  self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text) 
129                  self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text) 
130              elif tokenPositions[id] == "After": 
131                  self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text) 
132               
133              prevPosition = tokenPositions[id] 
134               
135              prevToken2Text = prevTokenText 
136              prevTokenText = text 
137       
138          return patternForeBetween, patternBetween, patternBetweenAfter 
 139   
140 -    def addToPattern(self, pattern, tokenText, prevTokenText, prevToken2Text): 
 141          if not pattern.has_key(tokenText): 
142              pattern[tokenText] = 0 
143          pattern[tokenText] += 1 
144           
145           
146          if prevTokenText != None: 
147              ngram1 = prevTokenText + "_" + tokenText 
148              if not pattern.has_key(ngram1): 
149                  pattern[ngram1] = 0 
150              pattern[ngram1] += 1 
151           
152          if prevToken2Text != None: 
153              ngram2 = prevToken2Text + "_" + ngram1 
154              if not pattern.has_key(ngram2): 
155                  pattern[ngram2] = 0 
156              pattern[ngram2] += 1 
 157   
159          dotProduct = 0.0 
160          length1 = 0.0 
161          length2 = 0.0 
162           
163          for k,v in pattern1.iteritems(): 
164              if pattern2.has_key(k): 
165                 dotProduct += v * pattern2[k] 
166           
167          for v in pattern1.values(): 
168              length1 += v * v 
169          length1 = math.sqrt(length1) 
170           
171          for v in pattern2.values(): 
172              length2 += v * v 
173          length2 = math.sqrt(length2) 
174           
175          if length1 == 0 or length2 == 0: 
176              return 0.0 
177          else: 
178              return dotProduct / (length1 * length2)