Package TEES :: Package Utils :: Package InteractionXML :: Module CorpusElements
[hide private]

Source Code for Module TEES.Utils.InteractionXML.CorpusElements

 1  from SentenceElements import * 
 2  import types 
 3  import sys, os 
 4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
 5  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
 6  import Utils.ElementTreeUtils as ETUtils 
 7   
def loadCorpus(filename, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False):
    """
    Load a corpus through ETUtils.ETFromObj and wrap its root in CorpusElements.

    filename -- corpus source, passed unchanged to ETUtils.ETFromObj. When it
                is a plain string, a "Loading corpus file" notice is written
                to stderr first. (NOTE(review): ETFromObj presumably also
                accepts non-string inputs, which is why the notice is
                conditional -- confirm against ETUtils.)
    parse, tokenization, removeIntersentenceInteractions, removeNameInfo --
                forwarded as-is to the CorpusElements constructor.

    Returns the CorpusElements object built from the loaded tree's root; the
    tree itself is handed to CorpusElements as its `tree` argument.
    """
    # The former local imports of cElementTree and gzip were never used here;
    # dropping them avoids an ImportError on interpreters that no longer ship
    # cElementTree.
    if isinstance(filename, str):
        sys.stderr.write("Loading corpus file %s\n" % filename)
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization, removeIntersentenceInteractions, corpusTree, removeNameInfo)
20
class CorpusElements:
    """
    Index over the "document" and "sentence" elements of a corpus root element.

    Attributes set by the constructor:
      tree                -- the `tree` argument, stored unchanged
      rootElement         -- the `rootElement` argument
      documents           -- list of the root's direct "document" children
      documentsById       -- document element keyed by its "id" attribute
      sentences           -- one SentenceElements object per sentence, in document order
      sentencesById       -- SentenceElements object keyed by the sentence "id" attribute
      sentencesByOrigId   -- same, keyed by "origId" (only sentences that have one)
      documentSentences   -- per-document lists of SentenceElements objects,
                             parallel to `documents`
    """

    def __init__(self, rootElement, parse, tokenization=None, removeIntersentenceInteractions=True, tree=None, removeNameInfo=False):
        """
        Build the document and sentence indices from rootElement.

        rootElement -- element whose direct "document" children are indexed
        parse, tokenization, removeIntersentenceInteractions --
                       forwarded to SentenceElements for every sentence
                       (presumably selecting which parse/tokenization to
                       attach -- confirm against SentenceElements)
        tree        -- optional object stored as self.tree
        removeNameInfo -- accepted for interface compatibility; not used by
                       this constructor

        Raises KeyError if a document or sentence element has no "id" attribute.
        When `parse` is given and some sentences lack a tokenization or parse
        element, a two-line warning is written to stderr.
        """
        self.tree = tree
        self.rootElement = rootElement
        self.documents = rootElement.findall("document")
        self.documentsById = {}
        self.sentencesById = {}
        self.sentencesByOrigId = {}
        self.sentences = []
        self.documentSentences = []
        counts = {"sentences": 0, "missing-tok": 0, "missing-parse": 0}
        # Missing tokenizations/parses are only tallied when a parse was requested.
        parseRequested = parse is not None
        for documentElement in self.documents:
            self.documentsById[documentElement.attrib["id"]] = documentElement
            currentDocSentences = []
            self.documentSentences.append(currentDocSentences)
            for sentenceElement in documentElement.findall("sentence"):
                counts["sentences"] += 1
                sentenceObj = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions)
                self.sentencesById[sentenceElement.attrib["id"]] = sentenceObj
                # `in` replaces dict.has_key, which no longer exists in Python 3.
                if "origId" in sentenceElement.attrib:
                    self.sentencesByOrigId[sentenceElement.attrib["origId"]] = sentenceObj
                self.sentences.append(sentenceObj)
                currentDocSentences.append(sentenceObj)
                if parseRequested:
                    # Identity checks: an element must not be tested by
                    # equality or truthiness to detect "absent".
                    if sentenceObj.tokenizationElement is None:
                        counts["missing-tok"] += 1
                    if sentenceObj.parseElement is None:
                        counts["missing-parse"] += 1
        if counts["missing-tok"] + counts["missing-parse"] > 0:
            sys.stderr.write(
                "Warning, parse missing from %s and tokenization from %s sentences out of a total of %s\n"
                % (counts["missing-parse"], counts["missing-tok"], counts["sentences"])
            )
            sys.stderr.write("Requested parse %s and tokenization %s\n" % (parse, tokenization))
51