1 """
2 Program: Classes for working with parse graphs
3 Date: Jan. 31, 2008
4 Author: Jari Bjorne
5
6 Description: This file contains classes and functions for working with
7 sentence parse graphs. A parse graph can be created from interaction xml.
8 The nodes can be modified and assigned attributes like weight, and the
9 result can be converted into an adjacency matrix. The parse graph can
10 also be used to generate different paths between its tokens.
11
12 Status: All classes and methods should be working.
13
14 Dependencies: PorterStemmer (CommonUtils/Stemming)
15 Range.py (CommonUtils)
16 """
17
18 import Utils.Libraries.PorterStemmer as stemmer
19 import Utils.Range as Range
20 import sys
21
23 """
24 Represents a single node (either token or dependency) in the parse graph
25 or a path generated by a ParseGraph-object.
26 """
28 self.isDependency = isDependency
29
30 self.pos = None
31 self.text = None
32 self.id = None
33 self.charOffset = None
34 self.dependencies = []
35 self.entities = []
36
37 self.dependencyType = None
38 self.fro = None
39 self.to = None
40
41 - def toString(self, showPos=False, highlight=False):
42 string = ""
43 if self.isDependency:
44 string = "-" + str(self.dependencyType) + "-"
45 else:
46 if showPos:
47 string = "[" + self.pos + "]"
48 else:
49 string = "[" + self.text + "]"
50 if highlight:
51 string = "{" + string[1:-1] + "}"
52 return string
53
55 """
56 A ParseGraph-object consists of tokens and dependencies. A dependency connects
57 two tokens with two edges (token->dependency->token, where -> = edge).
58 """
59 - def __init__(self, tokenElements, dependencyElements, mergeDependencies=False):
60 self.tokensById, self.dependenciesById = self.buildParseGraph(tokenElements, dependencyElements, mergeDependencies)
61
62
63
64
65
66 - def buildParseGraph(self, tokenElements, dependencyElements, mergeDependencies=False):
67 """ Returns dictionaries containing tokens and dependencies
68 of the graph generated from ElementTree-elements.
69 """
70 tokensById = {}
71 dependenciesById = {}
72 for tokenElement in tokenElements:
73 node = ParseGraphNode()
74 node.id = int(tokenElement.attrib["id"].split("_")[1])
75 node.pos = tokenElement.attrib["POS"]
76 node.text = tokenElement.attrib["text"]
77 charFrom, charTo = tokenElement.attrib["charOffset"].split("-")
78 node.charOffset = (int(charFrom), int(charTo))
79 tokensById[node.id] = node
80
81
82 dependencyIndex = len(tokensById) + 99
83 if mergeDependencies:
84 dependenciesByFroAndTo = {}
85 for dependencyElement in dependencyElements:
86 dependency = ParseGraphNode(True)
87 dependency.dependencyType = dependencyElement.attrib["type"]
88 dependency.fro = tokensById[int(dependencyElement.attrib["t1"].split("_")[1])]
89 dependency.to = tokensById[int(dependencyElement.attrib["t2"].split("_")[1])]
90
91 if mergeDependencies:
92 key = (dependency.fro.id, dependency.to.id)
93 if dependenciesByFroAndTo.has_key(key):
94 if not type(dependenciesByFroAndTo[key].dependencyType) == types.ListType:
95 dependenciesByFroAndTo[key].dependencyType = [dependenciesByFroAndTo[key].dependencyType]
96 dependenciesByFroAndTo[key].dependencyType.append(dependency.dependencyType)
97 else:
98 dependenciesByFroAndTo[key] = dependency
99 tokensById[dependency.fro.id].dependencies.append(dependency)
100 tokensById[dependency.to.id].dependencies.append(dependency)
101 dependency.id = dependencyIndex
102 assert( not dependenciesById.has_key(dependency.id) )
103 dependenciesById[dependency.id] = dependency
104 else:
105 tokensById[dependency.fro.id].dependencies.append(dependency)
106 tokensById[dependency.to.id].dependencies.append(dependency)
107
108
109 dependency.id = dependencyIndex
110 assert( not dependenciesById.has_key(dependency.id) )
111 dependenciesById[dependency.id] = dependency
112 dependencyIndex += 1
113
114 return tokensById, dependenciesById
115
116
117
118
119
121 """ Marks tokens belonging to named entities
122 """
123 namedEntityTokens = []
124 for entityElement in entityElements:
125 offsets = []
126 offsetStrings = entityElement.attrib["charOffset"].split(",")
127 for offsetString in offsetStrings:
128 charFrom, charTo = offsetString.split("-")
129 offset = (int(charFrom), int(charTo))
130 offsets.append(offset)
131 for k,v in self.tokensById.iteritems():
132 for offset in offsets:
133 if Range.overlap(offset, v.charOffset):
134 v.entities.append(entityElement.attrib["id"])
135 namedEntityTokens.append(v.id)
136 return namedEntityTokens
137
139 """ Returns the ids of all tokens in specified named entities
140 """
141 tokenIds = []
142 for key, node in self.tokensById.iteritems():
143 for id in namedEntityIds:
144 if id in node.entities:
145 tokenIds.append(node.id)
146 return tokenIds
147
148 - def getTokenIdsByText(self, texts, lookInsideNamedEntities=False):
149 """ Returns the ids of all tokens whose text attribute can be
150 found in the list texts. Can be used f.e. detecting interaction
151 words.
152 """
153 matchingTokens = []
154 for node in self.tokensById.values():
155 if len(node.entities) > 0 and not lookInsideNamedEntities:
156 continue
157 if node.text.lower() in texts:
158 matchingTokens.append(node.id)
159 elif node.text.find("-") != -1:
160 tempText = node.text.rsplit("-",1)[1]
161 if tempText.lower() in texts:
162 matchingTokens.append(node.id)
163 return matchingTokens
164
166 for token in self.tokensById.values():
167 token.stem = stemmer.stem(token.text)
168
169
170
171
172
174 """ Initializes the graph structure used by NetworkX.
175 Called automatically when needed.
176 """
177 import networkx as NX
178 self.nXGraph = NX.Graph()
179 for token in self.tokensById.values():
180 self.nXGraph.add_node(token.id)
181 for dependency in self.dependenciesById.values():
182 self.nXGraph.add_node(dependency.id)
183 self.nXGraph.add_edge((dependency.fro.id,dependency.id))
184 self.nXGraph.add_edge((dependency.id,dependency.to.id))
185
187 """ Build shortest path using NetworkX
188 """
189 import networkx as NX
190 if (not hasattr(self,"nXGraph")) or self.nXGraph == None:
191 self.buildNXGraph()
192 path = NX.shortest_path(self.nXGraph, startTokenId, endTokenId)
193 if path == False:
194 return []
195 isTokenPhase = True
196 for i in range(len(path)):
197 if isTokenPhase:
198 path[i] = self.tokensById[path[i]]
199 else:
200 path[i] = self.dependenciesById[path[i]]
201 isTokenPhase = not isTokenPhase
202 return path
203