#!/usr/bin/python
"""Build one HTML page of matching HALOG entry links per person.

For every entry in ``keywordall`` the script scans the link text of a fixed
set of HALOG archive index pages and writes the links whose text matches one
of that person's name patterns (and none of the ``ignword`` patterns) to
``/home/pzhu/halog_person/<name>.html``.

The script runs at import time; there is deliberately no ``__main__`` guard
so that its behavior as a plain script is unchanged.
"""
import re

from mechanize import Browser

try:  # Python 2
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import urljoin

try:  # Python 3
    from html import escape
except ImportError:  # Python 2
    from cgi import escape

# Each entry is [output file stem, regex pattern, regex pattern, ...].
# The patterns are regular-expression fragments matched case-insensitively.
keywordall = [
    ["min", "mhuang", "m. ?huang", "min"],
    ["pengjia", "pzhu", "pengjia", "pengj"],
    ["chao", "chao"],
    ["melissa", "M?.? ?cummings", "melissa"],
    ["toby", "toby", "T.? ?Badman", "badman"],
    ["ryan", "R?.? ?zielinski"],
    ["kalyan", "kalyan", "allada"],
    ["jixie", "jixie"],
    ["jie", "jie"],
    ["jp", "jianping", "j.? ?p.? ? ?chen", "J.? ?P"],
    ["alexandre", "A?.? ?Camsonne"],
    ["james", "j.? ?maxwell"],
    ["karl", "k?.? ?slifer"],
    ["vince", "vince", "V?.? ?Sulkosky"],
]

# Links whose text matches any of these patterns are skipped.
ignword = [r"\[target\]"]

# Archive identifiers substituted into ARCHIVE_URL (presumably YYMM -- confirm).
ARCHIVES = [1205, 1204, 1203, 1202, 1201, 1112, 1111, 1110]
ARCHIVE_URL = (
    "http://hallaweb.jlab.org/halog/log/html/%i_archive/logdir_noauto2.html"
)
OUTPUT_PATH = "/home/pzhu/halog_person/%s.html"

# NOTE(review): the header/footer templates are empty in the source as
# received (their markup appears to have been lost); fill in if a full
# HTML document wrapper is wanted.
HTML_HEADER = ""
HTML_FOOTER = ""
# One row per matching entry: (escaped absolute URL, escaped link text).
LINK_ROW = '<a href="%s">%s</a><br>'

br = Browser()
br.set_handle_robots(False)
br.set_handle_refresh(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# archive id -> [(absolute url, link text), ...]; filled on first use so each
# index page is downloaded once instead of once per person.
_link_cache = {}


def namematch(x, text):
    """Search *text* for pattern *x* delimited on both sides.

    The pattern must be preceded and followed by whitespace, ``/``, ``,`` or
    ``.``; a name at the very start or end of *text* therefore does not match.
    Matching is case-insensitive.

    Returns the match object, or ``None`` when there is no match.
    """
    m = re.compile(r"[\s\/,.]" + x + r"[\s\/,.]", re.I)
    return re.search(m, text)


def ignmatch(x, text):
    """Search *text* for pattern *x* anywhere, case-insensitively.

    Returns the match object, or ``None`` when there is no match.
    """
    m = re.compile(x, re.I)
    return re.search(m, text)


def _archive_links(archive_id):
    """Return ``[(absolute_url, text), ...]`` for one archive index page."""
    if archive_id not in _link_cache:
        br.open(ARCHIVE_URL % archive_id)
        _link_cache[archive_id] = [
            # Link text is coerced to "" so the regex search never gets None
            # (mechanize may report None for text-less links -- confirm).
            (urljoin(link.base_url, link.url), link.text or "")
            for link in br.links()
        ]
    return _link_cache[archive_id]


def makehtml():
    """Write the page for the current person to the current output file.

    Reads the module globals ``keyword`` (list of name patterns) and
    ``htmlfile`` (writable file object), both set by the driver loop below.
    A link is written when its text matches at least one ``keyword`` pattern
    and no ``ignword`` pattern.
    """
    htmlfile.write(HTML_HEADER + "\n")
    for archive_id in ARCHIVES:
        for absurl, text in _archive_links(archive_id):
            if not any(namematch(x, text) for x in keyword):
                continue
            if any(ignmatch(x, text) for x in ignword):
                continue
            # Both values come from a remote page, so escape them before
            # embedding them in markup.
            row = LINK_ROW % (escape(absurl, True), escape(text, True))
            htmlfile.write(row + "\n")
    htmlfile.write(HTML_FOOTER + "\n")


for key in keywordall:
    keyword = key[1:]
    # The context manager closes (and flushes) each page before the next one.
    with open(OUTPUT_PATH % key[0], "w") as htmlfile:
        makehtml()