1 """
2 Trigger examples
3 """
4 __version__ = "$Revision: 1.7 $"
5
6 import sys, os
7 thisPath = os.path.dirname(os.path.abspath(__file__))
8 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
9 from ExampleBuilder import ExampleBuilder
10 import Utils.Libraries.PorterStemmer as PorterStemmer
11 from Core.IdSet import IdSet
12 import Core.ExampleUtils as ExampleUtils
13
14 import Utils.InteractionXML.MapPhrases as MapPhrases
15 import Utils.Settings as Settings
16 import Utils.Download
17 from FeatureBuilders.TriggerFeatureBuilder import TriggerFeatureBuilder
18
# Lower-case words that are looked up with the "text" attribute of a phrase's
# first token when NP / NP-IN phrases are filtered under the "co_limits" style.
coNPPhraseFirstToken = set((
    "both",
    "each",
    "it",
    "its",
    "itself",
    "neither",
    "others",
    "that",
    "the",
    "their",
    "them",
    "themselves",
    "these",
    "they",
    "this",
    "those",
))
22
def installBBData(destPath=None, downloadPath=None, redownload=False, updateLocalSettings=False):
    """
    Download and extract the TEES data files for BB.

    destPath -- directory the files are extracted into; when None,
                <Settings.DATAPATH>/resources is used
    downloadPath -- directory for the downloaded archive; when None,
                    <Settings.DATAPATH>/resources/download is used
    redownload -- forwarded to Utils.Download.downloadAndExtract
    updateLocalSettings -- forwarded to Settings.setLocal

    After extraction the destination directory is stored under the
    "TEES_RESOURCES" key with Settings.setLocal.
    """
    print >> sys.stderr, "---------------", "Downloading TEES data files for BB", "---------------"
    print >> sys.stderr, "Bacteria tokens derived from LPSN (http://www.bacterio.cict.fr/)"
    # Identity test: only a genuinely missing argument selects the default,
    # regardless of how the argument's type implements ==.
    if destPath is None:
        destPath = os.path.join(Settings.DATAPATH, "resources")
    if downloadPath is None:
        downloadPath = os.path.join(Settings.DATAPATH, "resources/download")
    Utils.Download.downloadAndExtract(Settings.URL["TEES_RESOURCES"], destPath, downloadPath, redownload=redownload)
    Settings.setLocal("TEES_RESOURCES", destPath, updateLocalSettings)
32
34 f = open(filename, "rt")
35 names = []
36 for line in f:
37 if line.strip == "":
38 continue
39 if line.startswith("Note:"):
40 continue
41 namePart = line.split("18")[0].split("19")[0].split("(")[0]
42 names.append(namePart)
43 f.close()
44 return names
45
63
65 - def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Add the features of the token at position index to the feature vector.
Each feature name is prefixed with "linear_" + tag, the tag telling
where the token lies in the linear order.
"""
linearPrefix = "linear_" + tag
featureNames, featureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
for featureName in featureNames:
    # Look the weight up first, then allocate the feature id.
    weight = featureWeights[featureName]
    features[self.featureSet.getId(linearPrefix + featureName)] = weight
100
def buildLinearNGram(self, phraseTokens, sentenceGraph, features):
    """
    Set one feature that spells out the whole phrase: "ngram" followed by
    the lower-cased text of every phrase token, joined with underscores.
    An empty phrase yields the plain feature "ngram".
    """
    pieces = ["ngram"]
    for phraseToken in phraseTokens:
        pieces.append(sentenceGraph.getTokenText(phraseToken).lower())
    features[self.featureSet.getId("_".join(pieces))] = 1
106
def getPhraseHeadToken(self, phrase, phraseTokens):
    """
    Return the phrase token with the highest integer "headScore" attribute.

    When several tokens share the highest score the last of them wins.
    None is returned if phraseTokens is empty or no score reaches -9999.
    """
    topScore = -9999
    headToken = None
    for candidate in phraseTokens:
        score = int(candidate.get("headScore"))
        if score < topScore:
            continue
        topScore = score
        headToken = candidate
    return headToken
114
def getPhraseTokens(self, phrase, sentenceGraph):
    """
    Return the slice of sentenceGraph.tokens covered by the phrase, from
    its "begin" index up to and including its "end" index.
    """
    firstIndex = int(phrase.get("begin"))
    lastIndex = int(phrase.get("end"))
    # "end" is inclusive, so the slice has to stop one position later.
    stopIndex = lastIndex + 1
    return sentenceGraph.tokens[firstIndex:stopIndex]
119
def getCategoryName(self, phrase, phraseToEntity):
    """
    Return the class name for a phrase: "neg" when the phrase has no entry
    in phraseToEntity, otherwise the distinct "type" values of its
    entities, sorted and joined with "---".
    """
    if phrase in phraseToEntity:
        distinctTypes = set(entity.get("type") for entity in phraseToEntity[phrase])
        orderedTypes = sorted(distinctTypes)
        return "---".join(orderedTypes)
    return "neg"
127
def isPotentialCOTrigger(self, phrase, phraseTokens, sentenceGraph):
    """
    Decide whether a phrase is kept when the "co_limits" style is active.

    A phrase is accepted when any of these holds, checked in this order:
    - its "type" is neither "NP" nor "NP-IN"
    - at least one of its tokens is flagged in sentenceGraph.tokenIsName
    - the "text" of its first token is in coNPPhraseFirstToken

    The last test is case-sensitive and the set holds lower-case words only.
    """
    if phrase.get("type") not in ("NP", "NP-IN"):
        return True
    if any(sentenceGraph.tokenIsName[phraseToken] for phraseToken in phraseTokens):
        return True
    return phraseTokens[0].get("text") in coNPPhraseFirstToken
143
"""
Build one example for each phrase in the sentence

Candidate phrases are collected with MapPhrases, turned into feature
vectors and appended to outfile with ExampleUtils.appendExamples.
Phrases rejected by the "co_limits" style are recorded in exampleStats
but produce no example. Returns the number of examples written.
"""
# Hand the sentence to the trigger feature builder before building features.
self.triggerFeatureBuilder.initSentence(sentenceGraph)

# Running counter: forms the ".x<N>" suffix of example ids and is returned.
exampleIndex = 0

# Candidate phrases: NP / WHADVP / WHNP phrases from the parse, extended
# with whatever makeINSubPhrases and makeTokenSubPhrases derive from them
# (see Utils.InteractionXML.MapPhrases for the exact rules).
phrases = MapPhrases.getPhrases(sentenceGraph.parseElement, sentenceGraph.tokens, set(["NP", "WHADVP", "WHNP"]))
phraseDict = MapPhrases.getPhraseDict(phrases)
phrases.extend( MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens, phraseDict, ["NP"]) )
phrases.extend( MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict) )
# Maps a phrase to the list of entities associated with it; phrases without
# an entry become negative examples below.
phraseToEntity = MapPhrases.getPhraseEntityMapping(sentenceGraph.entities, phraseDict)

# Add this sentence's phrase type counts to the totals kept on the builder.
# NOTE: dict.has_key exists in Python 2 only.
phraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
for key in phraseTypeCounts.keys():
    if not self.phraseTypeCounts.has_key(key):
        self.phraseTypeCounts[key] = 0
    self.phraseTypeCounts[key] += phraseTypeCounts[key]
self.exampleStats.addVariable("Phrase type counts", self.phraseTypeCounts)

for phrase in phrases:
    # Fresh feature dict per example; it is also registered with the trigger
    # feature builder (presumably so that the builder writes into it -- confirm).
    features = {}
    self.triggerFeatureBuilder.setFeatureVector(features)

    categoryName = self.getCategoryName(phrase, phraseToEntity)
    category = self.classSet.getId(categoryName)
    phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
    phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
    self.exampleStats.beginExample(categoryName)

    # Optional filter: drop phrases that isPotentialCOTrigger rejects.
    # A filtered phrase does not advance exampleIndex.
    if self.styles["co_limits"] and not self.isPotentialCOTrigger(phrase, phraseTokens, sentenceGraph):
        self.exampleStats.filter("co_limits")
        self.exampleStats.endExample()
        continue

    # Copy the builder's bowFeatures into this example.
    features.update(self.triggerFeatureBuilder.bowFeatures)

    # Phrase-level features: full n-gram, phrase type and each "-"-separated
    # part of the phrase type.
    self.buildLinearNGram(phraseTokens, sentenceGraph, features)
    features[self.featureSet.getId("pType_"+phrase.get("type"))] = 1
    for split in phrase.get("type").split("-"):
        features[self.featureSet.getId("pSubType_"+split)] = 1

    # Number of phrase tokens flagged in sentenceGraph.tokenIsName, emitted
    # both as a one-hot feature and as a numeric value.
    nameCount = 0
    for token in phraseTokens:
        if sentenceGraph.tokenIsName[token]:
            nameCount += 1
    features[self.featureSet.getId("phraseNames_"+str(nameCount))] = 1
    features[self.featureSet.getId("phraseNameCount")] = nameCount

    # Features of the head token, built under the "head_" tag; the tag is
    # reset afterwards.
    self.triggerFeatureBuilder.setTag("head_")
    self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
    self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken, sentenceGraph)
    self.triggerFeatureBuilder.setTag()

    # Per-token pass with three tags: plain "ptok_", position counted from
    # the phrase start, and (negative) position counted from the phrase end.
    # NOTE(review): every buildFeatures call below receives phraseHeadToken,
    # not the loop variable `token`, so the head token's features are emitted
    # under each tag -- confirm whether `token` was intended.
    self.triggerFeatureBuilder.setTag("ptok_")
    phraseTokenPos = 0

    for token in phraseTokens:
        self.triggerFeatureBuilder.setTag("ptok_")
        self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
        self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos) + "_" )
        self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
        self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos-len(phraseTokens)) + "_" )
        self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)

        phraseTokenPos += 1
    self.triggerFeatureBuilder.setTag()

    # Extra attributes stored with the example: head token id, phrase id and
    # type, character offset, and the ids of the linked entities ("neg" when
    # the phrase has none).
    extra = {"xtype":"phrase","t":phraseHeadToken.get("id"), "p":phrase.get("id"), "ptype":phrase.get("type")}
    extra["charOffset"] = phrase.get("charOffset")
    if phrase not in phraseToEntity:
        extra["eids"] = "neg"
    else:
        extra["eids"] = ",".join([x.get("id") for x in phraseToEntity[phrase]])
    example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
    ExampleUtils.appendExamples([example], outfile)
    self.exampleStats.endExample()
    exampleIndex += 1

# Count the entities that are not marked isName="True" and that no phrase
# was mapped to. sum(..., []) flattens the per-phrase entity lists.
linkedEntities = set( sum(phraseToEntity.values(), []) )
for entity in sentenceGraph.entities:
    if entity.get("isName") != "True" and entity not in linkedEntities:
        self.exampleStats.addValue("Entities with no phrase", 1)

return exampleIndex
242