# Extract the red, bold-italic entries from a saved LPSN HTML page and write
# them one per line to a text file (presumably bacteria names, judging by the
# output file name -- confirm against the input page). Each extracted entry is
# also echoed to stdout.

INPUT_PATH = "/home/jari/data/BioNLP11SharedTask/resources/lpsn-alintro.html"
OUTPUT_PATH = "/home/jari/data/BioNLP11SharedTask/resources/lpsn-bacteria-names.txt"

# Input lines before this 1-based line number are ignored.
# NOTE(review): presumably this skips the page header/boilerplate -- confirm.
FIRST_DATA_LINE = 178

# A line is only considered when it mentions this colour at all.
RED_COLOR = "#FF0000"
# Markup that opens / closes one marked-up fragment.
NAME_OPEN = "<font color=\"#FF0000\"><i><b>"
NAME_CLOSE = "</b></i>"


def _extract_name(line):
    """Return the space-joined marked-up fragments found in *line*.

    The line is split on NAME_OPEN; from every piece that contains NAME_CLOSE
    the text before the first NAME_CLOSE is taken as a fragment. Fragments
    that start with "<" or end with ">" (i.e. that still carry markup at an
    edge) are dropped, as are empty fragments. Returns "" when nothing is
    left.

    Raises AssertionError if a fragment has leading or trailing whitespace,
    which would mean the page does not have the expected layout.
    """
    tokens = []
    for piece in line.strip().split(NAME_OPEN):
        if NAME_CLOSE not in piece:
            continue
        candidate = piece.split(NAME_CLOSE)[0]
        # Sanity check on the input format, kept from the original script.
        assert candidate.strip() == candidate, (candidate, line)
        if not candidate:
            # Empty markup such as "<i><b></b></i>": nothing to collect, and
            # indexing [0] / [-1] below would raise IndexError.
            continue
        if candidate[0] != "<" and candidate[-1] != ">":
            tokens.append(candidate)
    return " ".join(tokens).strip()


# Nested "with" blocks guarantee both files are closed even if parsing fails.
with open(INPUT_PATH, "rt") as f:
    with open(OUTPUT_PATH, "wt") as outFile:
        for count, line in enumerate(f, 1):
            if count < FIRST_DATA_LINE or RED_COLOR not in line:
                continue
            tokenString = _extract_name(line)
            if tokenString != "":
                # Single-argument form prints identically on Python 2 and 3.
                print(tokenString)
                outFile.write(tokenString + "\n")
26