1 from SentenceElements import *
2 import types
3 import sys, os
4 thisPath = os.path.dirname(os.path.abspath(__file__))
5 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
6 import Utils.ElementTreeUtils as ETUtils
7
def loadCorpus(filename, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False):
    """Load an interaction-XML corpus and wrap it in a CorpusElements object.

    filename -- path to an interaction-XML file (possibly gzipped), or any
                object ETUtils.ETFromObj can handle (file object, parsed tree)
    parse -- name of the parse to use for each sentence (None = no parse)
    tokenization -- name of the tokenization to use (None = parse's default)
    removeIntersentenceInteractions -- if True, drop interactions whose end
                points lie in different sentences
    removeNameInfo -- passed through to CorpusElements
    Returns a CorpusElements instance wrapping the parsed corpus tree.
    """
    # Announce loading only when given a path string; file-like objects and
    # pre-parsed trees are handled silently by ETFromObj.
    # isinstance/basestring also accepts unicode paths, which the old
    # type(filename) == types.StringType check silently rejected.
    if isinstance(filename, basestring):
        print >> sys.stderr, "Loading corpus file", filename
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization, removeIntersentenceInteractions, corpusTree, removeNameInfo)
20
22 - def __init__(self, rootElement, parse, tokenization=None, removeIntersentenceInteractions=True, tree=None, removeNameInfo=False):
23 self.tree = tree
24 self.rootElement = rootElement
25 self.documents = rootElement.findall("document")
26 self.documentsById = {}
27 self.sentencesById = {}
28 self.sentencesByOrigId = {}
29 self.sentences = []
30 self.documentSentences = []
31 counts = {"sentences":0, "missing-tok":0, "missing-parse":0}
32 for documentElement in self.documents:
33 self.documentsById[documentElement.attrib["id"]] = documentElement
34 sentenceElements = documentElement.findall("sentence")
35 self.documentSentences.append([])
36 for sentenceElement in sentenceElements:
37 counts["sentences"] += 1
38 sentenceObj = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions)
39 self.sentencesById[sentenceElement.attrib["id"]] = sentenceObj
40 if sentenceElement.attrib.has_key("origId"):
41 self.sentencesByOrigId[sentenceElement.attrib["origId"]] = sentenceObj
42 self.sentences.append(sentenceObj)
43 self.documentSentences[-1].append(sentenceObj)
44 if parse != None and sentenceObj.tokenizationElement == None:
45 counts["missing-tok"] += 1
46 if parse != None and sentenceObj.parseElement == None:
47 counts["missing-parse"] += 1
48 if counts["missing-tok"] + counts["missing-parse"] > 0:
49 print >> sys.stderr, "Warning, parse missing from", counts["missing-parse"], "and tokenization from", counts["missing-tok"], "sentences out of a total of", counts["sentences"]
50 print >> sys.stderr, "Requested parse", parse, "and tokenization", tokenization
51