import sys, os, copy
import sys, os
# Put the parent directory of this file on sys.path so that the project's
# "Utils" package can be imported below.
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
from Utils.ProgressCounter import ProgressCounter
# Prefer the C-accelerated ElementTree bundled with Python; fall back to the
# stand-alone cElementTree package when the bundled one is unavailable.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
from collections import defaultdict
import types
13
def getEmptyCorpus(xml, deletionRules=None, removeNames=False):
    """
    A convenience function for getting an empty corpus, useful for testing
    for information leaks in the event extraction process.

    NOTE(review): the "def" line of this function is absent from the visible
    source; its name and default values were reconstructed from the body and
    the docstring -- confirm against the callers.

    Args:
        xml: a string, which is handed to ETUtils.ETFromObj for loading, or
            any other object, which is deep-copied so that the caller's
            instance is not modified by the deletion.
        deletionRules: dictionary {elementTag: {attributeName: values}}
            forwarded to processCorpus. When None, a default rule set is
            chosen according to removeNames.
        removeNames: only consulted when deletionRules is None. If true the
            default rules match every "entity" element, otherwise only
            entities whose "isName" attribute is "False". Every
            "interaction" element is matched in both cases.

    Returns:
        The return value of processCorpus for the processed copy.
    """
    if isinstance(xml, types.StringTypes):
        # Loaded fresh from the string, so it is already a private instance
        xml = ETUtils.ETFromObj(xml)
    else:
        # Already an object in memory: work on a copy
        xml = copy.deepcopy(xml)
    if deletionRules is None:
        # An empty attribute dictionary places no condition on the element
        if removeNames:
            deletionRules = {"interaction":{},"entity":{}}
        else:
            deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    # No output file: the caller only wants the in-memory result
    return processCorpus(xml, None, deletionRules)
35
def removeElements(parent, elementName, attributes, countsByType):
    """
    Recursively remove elements tagged elementName from below parent.

    A child whose tag equals elementName is removed when, for every
    (attributeName, values) pair in attributes, the child's value for that
    attribute is contained in values. An empty attributes dictionary
    therefore removes every child with that tag. Children with a different
    tag are not removed; the search continues inside them instead.

    NOTE(review): indentation was lost in the visible source. The recursion
    branch is read here as the "else" of the tag comparison -- confirm.

    Args:
        parent: ElementTree element whose subtree is searched and modified.
        elementName: tag of the elements to delete.
        attributes: dictionary {attributeName: container of accepted values}.
        countsByType: mapping that is incremented once per removed element,
            keyed by elementName + " " + str(matched attribute values). It
            must supply a default of 0 for unseen keys (e.g. defaultdict(int)).
    """
    toRemove = []
    # list(parent) replaces the deprecated Element.getchildren() and gives a
    # snapshot, so the tree is never modified while being iterated.
    for element in list(parent):
        if element.tag != elementName:
            removeElements(element, elementName, attributes, countsByType)
            continue
        matched = {}
        for attrName, values in attributes.items():
            value = element.get(attrName)
            if value not in values:
                # One failing rule is enough to keep the element
                break
            matched.setdefault(attrName, set()).add(value)
        else:
            # Loop finished without break: every rule was satisfied
            toRemove.append(element)
            countsByType[elementName + " " + str(matched)] += 1
    for element in toRemove:
        parent.remove(element)
57
58
59
def processSentence(sentence, rules, countsByType):
    """
    Apply every deletion rule to one sentence element.

    Args:
        sentence: element passed to removeElements as the parent to search.
        rules: dictionary {elementTag: attributeRules}; each entry results in
            one removeElements call.
        countsByType: counter mapping forwarded to removeElements.
    """
    # Sorted so that the rules are always applied in the same order
    for key in sorted(rules):
        removeElements(sentence, key, rules[key], countsByType)
64
def processCorpus(inputFilename, outputFilename, rules):
    """
    Delete the elements matched by rules from every sentence of a corpus.

    Args:
        inputFilename: corpus source, loaded with ETUtils.ETFromObj.
        outputFilename: when not None, the processed corpus root is written
            there with ETUtils.write.
        rules: dictionary {elementTag: {attributeName: values}} where values
            is a "|"-separated string of accepted attribute values. A
            sequence of values is accepted as well. The dictionary is not
            modified.

    Returns:
        The object returned by ETUtils.ETFromObj, after the deletions.
    """
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    # Split the "a|b" value strings into lists in a private copy. Rewriting
    # the caller's dictionary in place made a second call with the same
    # rules fail, because the values were no longer strings.
    parsedRules = {}
    for eType in rules:
        parsedRules[eType] = {}
        for attrRule, accepted in rules[eType].items():
            if hasattr(accepted, "split"):
                parsedRules[eType][attrRule] = accepted.split("|")
            else:
                parsedRules[eType][attrRule] = list(accepted)

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, parsedRules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType):
        print >> sys.stderr, " " + k + ":", countsByType[k]

    if outputFilename is not None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
90
if __name__=="__main__":
    # Command line entry point: read the options, then run processCorpus.
    import sys
    # NOTE(review): this banner and the usage text below look copied from
    # another script; they are left unchanged here.
    print >> sys.stderr, "##### Split elements with merged types #####"

    from optparse import OptionParser

    # Psyco is an optional accelerator; running without it is fine
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nPath generator.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-r", "--rules", default=None, dest="rules", help="dictionary of python dictionaries with attribute:value pairs.")
    (options, args) = optparser.parse_args()

    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)
    # Without this check a missing -r option reached eval(None) and ended in
    # an unexplained TypeError.
    if options.rules is None:
        print >> sys.stderr, "Error, rules not defined."
        optparser.print_help()
        sys.exit(1)

    # NOTE(review): eval() executes arbitrary Python taken from the command
    # line. ast.literal_eval would be sufficient for a literal dictionary;
    # eval is kept so that the accepted input does not change.
    rules = eval(options.rules)
    print >> sys.stderr, "Rules:", rules
    processCorpus(options.input, options.output, rules)
123