Package TEES :: Package ExampleBuilders :: Module PhraseTriggerExampleBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.PhraseTriggerExampleBuilder

  1  """ 
  2  Trigger examples 
  3  """ 
  4  __version__ = "$Revision: 1.7 $" 
  5   
  6  import sys, os 
  7  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  8  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  9  from ExampleBuilder import ExampleBuilder 
 10  import Utils.Libraries.PorterStemmer as PorterStemmer 
 11  from Core.IdSet import IdSet 
 12  import Core.ExampleUtils as ExampleUtils 
 13  #from Core.Gazetteer import Gazetteer 
 14  import Utils.InteractionXML.MapPhrases as MapPhrases 
 15  import Utils.Settings as Settings 
 16  import Utils.Download 
 17  from FeatureBuilders.TriggerFeatureBuilder import TriggerFeatureBuilder 
 18   
 19  coNPPhraseFirstToken = set(["both", "each", "it", "its", "itself", "neither", "others", 
 20                              "that", "the", "their", "them", "themselves", "these", "they", 
 21                              "this", "those"]) 
 22   
def installBBData(destPath=None, downloadPath=None, redownload=False, updateLocalSettings=False):
    """
    Download and extract the TEES resource files used for the BB data.

    The archive at Settings.URL["TEES_RESOURCES"] is fetched and extracted
    with Utils.Download.downloadAndExtract, after which the destination
    directory is stored as the local setting "TEES_RESOURCES".

    destPath -- directory to extract into. When None, defaults to
                <Settings.DATAPATH>/resources.
    downloadPath -- directory passed to downloadAndExtract as the download
                    location. When None, defaults to
                    <Settings.DATAPATH>/resources/download.
    redownload -- forwarded unchanged to downloadAndExtract.
    updateLocalSettings -- forwarded unchanged to Settings.setLocal.
    """
    print >> sys.stderr, "---------------", "Downloading TEES data files for BB", "---------------"
    print >> sys.stderr, "Bacteria tokens derived from LPSN (http://www.bacterio.cict.fr/)"
    # Identity comparison: only a genuinely missing argument triggers the default
    if destPath is None:
        destPath = os.path.join(Settings.DATAPATH, "resources")
    if downloadPath is None:
        downloadPath = os.path.join(Settings.DATAPATH, "resources/download")
    Utils.Download.downloadAndExtract(Settings.URL["TEES_RESOURCES"], destPath, downloadPath, redownload=redownload)
    Settings.setLocal("TEES_RESOURCES", destPath, updateLocalSettings)
32
def getBacteriaNames(filename):
    """
    Read bacteria names from a text file with one entry per line.

    Blank (whitespace-only) lines and lines starting with "Note:" are
    skipped. For every other line, everything from the first occurrence of
    "18", "19" or "(" onwards is cut off (presumably the year / author
    citation that follows the name -- confirm against the data file).

    The remaining prefix is returned as-is, i.e. it is not stripped and may
    end in whitespace; callers such as getBacteriaTokens split on whitespace.

    filename -- path of the text file to read.
    Returns a list of name strings in file order.
    """
    names = []
    # "with" guarantees the file is closed even if reading raises
    with open(filename, "rt") as f:
        for line in f:
            # strip() must be *called*; comparing the method object itself
            # to "" is always False and let blank lines through as names
            if line.strip() == "":
                continue
            if line.startswith("Note:"):
                continue
            namePart = line.split("18")[0].split("19")[0].split("(")[0]
            names.append(namePart)
    return names
45
def getBacteriaTokens(names=None):
    """
    Return a set of bacteria name tokens.

    If the Settings module has no TEES_RESOURCES attribute, the resource
    files are first installed with installBBData(updateLocalSettings=True).

    names -- optional iterable of name strings. When given, each name is
             split on whitespace and the lower-cased parts form the result.
             When None, the tokens are read from "bacteria-tokens.txt" in
             Settings.TEES_RESOURCES, one stripped line per token (these
             are added exactly as found in the file, without lower-casing).
    Returns a set of strings.
    """
    # Install file if needed
    if not hasattr(Settings, "TEES_RESOURCES"):
        print >> sys.stderr, "TEES example builder data files not installed, installing now"
        installBBData(updateLocalSettings=True)
    # Get the tokens
    tokens = set()
    if names is not None:
        for name in names:
            tokens.update(part.lower() for part in name.split())
    else:
        tokenFilePath = os.path.join(Settings.TEES_RESOURCES, "bacteria-tokens.txt")
        # "with" closes the file even if reading fails part-way
        with open(tokenFilePath, "rt") as f:
            for line in f:
                tokens.add(line.strip())
    return tokens
63
class PhraseTriggerExampleBuilder(ExampleBuilder):
    """
    Example builder that produces one classification example per phrase.

    Phrases are obtained through Utils.InteractionXML.MapPhrases from the
    parse of a sentence; each phrase is labelled with the types of the
    entities mapped to it (or "neg" when there are none) and described with
    features from a TriggerFeatureBuilder.
    """

    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
        """
        style -- passed to self.getParameters; "co_limits" is the only
                 default parameter registered here.
        classSet -- class id set; a new IdSet(1) is created when None.
        featureSet -- feature id set; a new IdSet() is created when None.
        gazetteerFileName -- accepted for interface compatibility, not used.
        """
        if classSet is None:
            classSet = IdSet(1)
        # The lookup is done outside the assert so that it is still executed
        # when Python runs with -O (which strips assert statements).
        negId = classSet.getId("neg")
        assert negId == 1, "class 'neg' must have id 1"
        if featureSet is None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self._setDefaultParameters(["co_limits"])
        self.styles = self.getParameters(style)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
        # Accumulated over all sentences processed by this builder
        self.phraseTypeCounts = {}

#    @classmethod
#    def run(cls, input, output, parse, tokenization, style, idFileTag=None, gazetteerFileName=None):
#        classSet, featureSet = cls.getIdSets(idFileTag)
#        e = PhraseTriggerExampleBuilder(style, classSet, featureSet)
#        if "names" in style:
#            sentences = cls.getSentences(input, parse, tokenization, removeNameInfo=True)
#        else:
#            sentences = cls.getSentences(input, parse, tokenization, removeNameInfo=False)
#        e.phraseTypeCounts = {}
#        e.buildExamplesForSentences(sentences, output, idFileTag)
#        print >> sys.stderr, "Phrase type counts:", e.phraseTypeCounts

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.

        The features of sentenceGraph.tokens[index] are written into the
        features dict under ids for "linear_" + tag + feature name.
        """
        # NOTE(review): getTokenFeatures is not defined in this class;
        # presumably it is inherited from ExampleBuilder -- confirm.
        tag = "linear_" + tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId(tag + tokenFeature)] = tokenFeatureWeights[tokenFeature]

    def buildLinearNGram(self, phraseTokens, sentenceGraph, features):
        """
        Add a single feature "ngram_<tok1>_<tok2>_..." built from the
        lower-cased texts of the phrase tokens, with value 1.
        """
        parts = ["ngram"]
        for token in phraseTokens:
            parts.append(sentenceGraph.getTokenText(token).lower())
        features[self.featureSet.getId("_".join(parts))] = 1

    def getPhraseHeadToken(self, phrase, phraseTokens):
        """
        Return the token with the highest integer "headScore" attribute.

        On ties the rightmost token wins. Returns None when phraseTokens is
        empty. The phrase argument is not used.
        """
        bestScore = -9999
        bestToken = None
        for token in phraseTokens:
            headScore = int(token.get("headScore"))
            if headScore >= bestScore:  # >= because rightmost is best
                bestScore = headScore
                bestToken = token
        return bestToken

    def getPhraseTokens(self, phrase, sentenceGraph):
        """
        Return the tokens covered by the phrase, using its "begin" and
        "end" attributes as inclusive indices into sentenceGraph.tokens.
        """
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        return sentenceGraph.tokens[phraseBegin:phraseEnd + 1]

    def getCategoryName(self, phrase, phraseToEntity):
        """
        Return the class name for a phrase: "neg" if the phrase has no entry
        in phraseToEntity, otherwise the distinct "type" attributes of its
        entities, sorted and joined with "---".
        """
        if phrase not in phraseToEntity:
            return "neg"
        entityTypes = set()
        for entity in phraseToEntity[phrase]:
            entityTypes.add(entity.get("type"))
        return "---".join(sorted(entityTypes))

    def isPotentialCOTrigger(self, phrase, phraseTokens, sentenceGraph):
        """
        Decide whether a phrase passes the "co_limits" filter.

        Only phrases of type "NP" or "NP-IN" are restricted; those pass if
        any of their tokens is marked in sentenceGraph.tokenIsName or if the
        text of their first token is in coNPPhraseFirstToken. All other
        phrase types always pass.
        """
        # Check type
        if phrase.get("type") not in ["NP", "NP-IN"]:  # only limit these types
            return True
        # Check named entities
        for token in phraseTokens:
            if sentenceGraph.tokenIsName[token]:
                return True
        # Check first word
        # NOTE(review): the comparison is case-sensitive while the word list
        # is all lower-case, so e.g. a sentence-initial "These" does not
        # match -- confirm whether that is intended.
        return phraseTokens[0].get("text") in coNPPhraseFirstToken

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one example for each phrase in the sentence

        Each example is appended to outfile with ExampleUtils.appendExamples.
        Phrases rejected by the "co_limits" style are recorded as filtered
        in self.exampleStats and produce no example. The goldGraph argument
        is not used. Returns the number of examples written.
        """
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        exampleIndex = 0

        # Prepare phrases, create subphrases
        #filter = set(["NP", "TOK-IN", "WHADVP", "WHNP", "TOK-WP$", "TOK-PRP$", "NP-IN"])
        phrases = MapPhrases.getPhrases(sentenceGraph.parseElement, sentenceGraph.tokens, set(["NP", "WHADVP", "WHNP"]))
        phraseDict = MapPhrases.getPhraseDict(phrases)
        phrases.extend(MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens, phraseDict, ["NP"]))
        phrases.extend(MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict))
        phraseToEntity = MapPhrases.getPhraseEntityMapping(sentenceGraph.entities, phraseDict)
        # Make counts
        phraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
        for key in phraseTypeCounts:
            self.phraseTypeCounts[key] = self.phraseTypeCounts.get(key, 0) + phraseTypeCounts[key]
        self.exampleStats.addVariable("Phrase type counts", self.phraseTypeCounts)  # can be added on each loop, will always point to the same thing

        # Build one example for each phrase
        for phrase in phrases:
            features = {}
            self.triggerFeatureBuilder.setFeatureVector(features)

            categoryName = self.getCategoryName(phrase, phraseToEntity)
            category = self.classSet.getId(categoryName)
            phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
            phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
            self.exampleStats.beginExample(categoryName)

            if self.styles["co_limits"] and not self.isPotentialCOTrigger(phrase, phraseTokens, sentenceGraph):
                self.exampleStats.filter("co_limits")
                self.exampleStats.endExample()
                continue

            # Sentence level features
            features.update(self.triggerFeatureBuilder.bowFeatures)

            # Whole phrase features
            self.buildLinearNGram(phraseTokens, sentenceGraph, features)
            features[self.featureSet.getId("pType_" + phrase.get("type"))] = 1
            for split in phrase.get("type").split("-"):
                features[self.featureSet.getId("pSubType_" + split)] = 1
            # Check named entities
            nameCount = 0
            for token in phraseTokens:
                if sentenceGraph.tokenIsName[token]:
                    nameCount += 1
            features[self.featureSet.getId("phraseNames_" + str(nameCount))] = 1
            features[self.featureSet.getId("phraseNameCount")] = nameCount

            # Head token features
            self.triggerFeatureBuilder.setTag("head_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
            self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken, sentenceGraph)
            self.triggerFeatureBuilder.setTag()

            # Features for all phrase tokens. Each token is described three
            # times: untagged by position, by its index from the phrase
            # start, and by its (negative) index from the phrase end.
            # The features are built from the loop token itself; earlier
            # code passed the head token on every iteration, so models
            # trained on that output need retraining.
            self.triggerFeatureBuilder.setTag("ptok_")
            numPhraseTokens = len(phraseTokens)
            for phraseTokenPos, token in enumerate(phraseTokens):
                positionTags = ("ptok_",
                                "ptok_" + str(phraseTokenPos) + "_",
                                "ptok_" + str(phraseTokenPos - numPhraseTokens) + "_")
                for positionTag in positionTags:
                    self.triggerFeatureBuilder.setTag(positionTag)
                    self.triggerFeatureBuilder.buildFeatures(token, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag()

            extra = {"xtype":"phrase", "t":phraseHeadToken.get("id"), "p":phrase.get("id"), "ptype":phrase.get("type")}
            extra["charOffset"] = phrase.get("charOffset")
            if phrase not in phraseToEntity:
                extra["eids"] = "neg"
            else:
                extra["eids"] = ",".join([x.get("id") for x in phraseToEntity[phrase]])
            example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            self.exampleStats.endExample()
            exampleIndex += 1

        # Mark missed entities in exampleStats
        # (set.update avoids the quadratic list concatenation of sum(..., []))
        linkedEntities = set()
        for entities in phraseToEntity.values():
            linkedEntities.update(entities)
        for entity in sentenceGraph.entities:
            if entity.get("isName") != "True" and entity not in linkedEntities:
                self.exampleStats.addValue("Entities with no phrase", 1)
                # Marking these as filtered examples was misleading, as examples are per phrase, and these are entities
                #self.exampleStats.beginExample(entity.get("type"))
                #self.exampleStats.filter("no_phrase")
                #self.exampleStats.endExample()
        return exampleIndex
242