Package TEES :: Package Utils :: Package Convert :: Module processLPSN
[hide private]

Source Code for Module TEES.Utils.Convert.processLPSN

 1  f = open("/home/jari/data/BioNLP11SharedTask/resources/lpsn-alintro.html", "rt") 
 2  outFile = open("/home/jari/data/BioNLP11SharedTask/resources/lpsn-bacteria-names.txt", "wt") 
 3   
 4  count = 0 
 5  for line in f: 
 6      count += 1 
 7      if count < 178: 
 8          continue 
 9      if "#FF0000" in line: 
10          splits = line.strip().split("<font color=\"#FF0000\"><i><b>") 
11          #print splits 
12          tokens = [] 
13          for split in splits: 
14              if "</b></i>" in split: 
15                  split2 = split.split("</b></i>")[0] 
16                  assert split2.strip() == split2, (split2, line) 
17                  if split2[0] != "<" and split2[-1] != ">": 
18                      tokens.append(split2) 
19          tokenString = " ".join(tokens).strip() 
20          if tokenString != "": 
21              print tokenString 
22              outFile.write(tokenString + "\n") 
23   
24  f.close() 
25  outFile.close() 
26