1  from SentenceElements import * 
 2  import types 
 3  import sys, os 
 4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
 5  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
 6  import Utils.ElementTreeUtils as ETUtils 
 7   
 8 -def loadCorpus(filename, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False): 
  9      try: 
10          import xml.etree.cElementTree as ET 
11      except ImportError: 
12          import cElementTree as ET 
13      import sys, gzip 
14       
15      if type(filename) == types.StringType: 
16          print >> sys.stderr, "Loading corpus file", filename 
17      corpusTree = ETUtils.ETFromObj(filename) 
18      corpusRoot = corpusTree.getroot() 
19      return CorpusElements(corpusRoot, parse, tokenization, removeIntersentenceInteractions, corpusTree, removeNameInfo) 
 20   
22 -    def __init__(self, rootElement, parse, tokenization=None, removeIntersentenceInteractions=True, tree=None, removeNameInfo=False): 
 23          self.tree = tree 
24          self.rootElement = rootElement 
25          self.documents = rootElement.findall("document") 
26          self.documentsById = {} 
27          self.sentencesById = {} 
28          self.sentencesByOrigId = {} 
29          self.sentences = [] 
30          self.documentSentences = [] 
31          counts = {"sentences":0, "missing-tok":0, "missing-parse":0} 
32          for documentElement in self.documents: 
33              self.documentsById[documentElement.attrib["id"]] = documentElement 
34              sentenceElements = documentElement.findall("sentence") 
35              self.documentSentences.append([]) 
36              for sentenceElement in sentenceElements: 
37                  counts["sentences"] += 1 
38                  sentenceObj = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions) 
39                  self.sentencesById[sentenceElement.attrib["id"]] = sentenceObj 
40                  if sentenceElement.attrib.has_key("origId"): 
41                      self.sentencesByOrigId[sentenceElement.attrib["origId"]] = sentenceObj 
42                  self.sentences.append(sentenceObj) 
43                  self.documentSentences[-1].append(sentenceObj) 
44                  if parse != None and sentenceObj.tokenizationElement == None: 
45                      counts["missing-tok"] += 1 
46                  if parse != None and sentenceObj.parseElement == None: 
47                      counts["missing-parse"] += 1 
48          if counts["missing-tok"] + counts["missing-parse"] > 0: 
49              print >> sys.stderr, "Warning, parse missing from", counts["missing-parse"], "and tokenization from", counts["missing-tok"], "sentences out of a total of", counts["sentences"] 
50              print >> sys.stderr, "Requested parse", parse, "and tokenization", tokenization 
  51