This is a program I wrote two years ago. Feel free to take it and pick it apart if it is useful to you.
"""从搜索引擎查询结果返回的第一页html提取信息,本项目的核心程序。""" import re from bs4 import BeautifulSoup import os from random import choice from time import time import urllib from urllib.request import urlopen import requests 手机号正则=re.compile(r"1[34578]\d{9}") blacklist=['baishicha.com','dianping.com','135130.com', 'jiweixin168.com', '0595lh.com', 'chahaoyi.com', 'soudianhua.com', 'ip138.com', 'hiphop8.com', 'gpxz.com', 'sohao.org', 'shouji.360.cn', '911cha.com', 'shoujichahao.com', 'mv188.com', 'juzhao.net', 'qqdna.com', 'hdjr.org', 'quanzh.com', 'chahaoba.com', 'guisd.com', 'qzbst.net', 'qeayer.com', 'chazidian.com', 'sjhcx.com', 'jihaoba.com', '2dianying.net', '0597zp.com', 'supfree.net', 'xysjk.com', 'beyaiw.com', 'sjgsd.com', 'chaohaoyi.com', '03412.com', 'gdhaoma.com', 'haomadi.com', 'qq5533.com', 'guoguo-app.com', '9om.com', 'xingming.net', 'tongmengguo.com', '51so.cc', '27txt.com', '51hao.cc', 'dyndns.tv', 'tool.la', '17012345.com', 'phone001.com', 'dunkun.com', 'chaxuntu.com', 'chinxm.com', 'niukaba.com', 'tiaohao.com', 'sojkan.com', 'shoujigenzong.com', '139018.com', 'chincm.com', 'labelbao.com', 'ashercall.info', 'fpsjk.org', '13131313131.com', 'rrsms.com', '139018.com', 'search-phone-owner.com', 'netrui.com', '82029.com', 'wiki-numbers.ru', 'who-called.biz', 'haomazhuanjia.com', 'mshaoma.com', '100170.net', 'beyaiw.com', 'adorkablepet.net', 'xgfc.net', 'aviciifans.com', '135cha.com', 'tiantianxieye.com', 'yenwoop.com', 'chaxunw.com', 'aiqzu.net', 'fzrc.org', 'herowind.com'] def 随机生成文件名(): 字符集='abcdefghijklmnopqrstuvwxyz01234567890' filename="".join([ choice(字符集) for _ in range(10)]) s=str(time()) s=s.split(".")[-1] return filename + s + ".html" def geturltext3(url): resp = requests.get(url, timeout=5, verify=True) #可以返回resp.raw,resp.content,resp.text,resp.json #print(resp.status_code) #200正常 resp.encoding='utf-8' content=resp.text return content def geturltext2(url): """得到文本型网页源代码""" content="" htmlcodestyle='utf-8' #根据网页的编码进行更改 网页打开器 = urllib.request.URLopener() 网页打开器.addheaders = [] 网页打开器.addheader('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36') 网页打开器.addheader('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6') 网页打开器.addheader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;') #print("正打开网址:" + url) try: f = 网页打开器.open(url) content = f.read().decode(htmlcodestyle) f.close() except:#如果发生意外,则调用requests模块的get方法进行获取网页源代码 content= geturltext3(url) return content def geturltext(url): filename= 随机生成文件名() yy=os.popen("scrapy fetch " + url + " --nolog >> " + filename) #不会出现黑框的DOS命令提示符 yy.close() #等待上面的指令执行完毕 f=open(filename,mode='rb') html=f.read() f.close() os.remove(filename) html=html.decode(encoding='utf-8') #这里针对webcrawler进行特殊处理,因为它可能会弹出验证码,要参加什么调查之类的 #下载的网页里面有这样的标签:<div class="captcha-container"> if len(html)<1000 or "captcha-container" in html: #认为返回结果不正常 html=geturltext2(url) return html def puretitle(title): title=title.split("–")[0] title=title.split("_")[0] title=title.split("-")[0] title=title.split("|")[0] title=title.strip() return title def is_chinese(uchar): """ 判断一个unicode是否是汉字 """ if uchar >= u'\u4e00' and uchar<=u'\u9fa5': return True else : return False def containchinese(astring): ischinesestring=False for c in astring: if is_chinese(c): ischinesestring=True break return ischinesestring def extractdomain_bing(url): """从网址提取域名,for bing""" L=url.split("/") domain=L[0] if url.startswith("https://") or url.startswith("http://"):domain=L[2] 
def extractdomain_bing(url):
    """Extract the domain from a URL, for bing."""
    L = url.split("/")
    domain = L[0]
    if url.startswith("https://") or url.startswith("http://"):
        domain = L[2]
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfrombing(mobile, html):
    """Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('li', {'class': 'b_algo'})
    availableitems = []
    for item in result:
        # 0: title
        title = item.find_all('h2')[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        sourceurl = item.find_all('cite')[0].text
        domain = extractdomain_bing(sourceurl)
        if domain in blacklist:
            continue
        brief = item.find_all('p')[0].text
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers in one snippet means the site is just a
        # number directory with no real information about this number.
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue  # the snippet does not even mention the number, skip
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")


def extractdomain_chinaso(url):
    """Extract the domain from a URL, for chinaso."""
    if "..." in url:
        domain = url.split("...")[0]
    else:
        L = url.split("\xa0")
        url = L[-2]
        L = url.split("/")
        domain = L[0]
        if url.startswith("https://") or url.startswith("http://"):
            domain = L[2]
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromchinaso(mobile, html):
    """Parse the result page of China Search (http://www.chinaso.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('li', {'class': 'reItem '})
    availableitems = []
    for i in range(1, len(result)):
        item = result[i]
        # 0: title
        title = item.find_all('h2')[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        sourceurl = item.find_all('p', {'class': 'snapshot'})[0].text
        domain = extractdomain_chinaso(sourceurl)
        if domain in blacklist:
            continue
        brief = item.find_all('p')[0].text
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue  # the snippet does not mention the number, skip
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")


def extractdomain_360(url):
    """Extract the domain from a URL, for 360."""
    L = url.split("/")
    domain = L[2]
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfrom360(mobile, html):
    """Parse the result page of 360 Search (http://www.so.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('li', {'class': 'res-list'})
    availableitems = []
    for i in range(1, len(result)):
        item = result[i]
        # 0: title; for 360 the title block also carries the source URL
        h3tag = item.find_all('h3', {'class': 'res-title'})[0]
        title = h3tag.text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        linkdata = h3tag.find_all('a')[0]
        if linkdata.has_attr('data-url'):
            sourceurl = linkdata['data-url']
        else:
            sourceurl = linkdata['href']
        domain = extractdomain_360(sourceurl)
        if domain in blacklist:
            continue
        brieftag = item.find_all('p', {'class': 'res-desc'})
        if len(brieftag) > 0:
            brief = brieftag[0].text
        else:
            brieftag = item.find_all('div', {'class': 'res-rich res-realty clearfix'})
            if len(brieftag) > 0:
                brief = brieftag[0].text
            else:
                brieftag = item.find_all('p')
                if len(brieftag) > 2:
                    brief = brieftag[1].text
                else:
                    continue
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue  # the snippet does not mention the number, skip
        availableitems.append((title, brief, sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")
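# Every extractfromXXX parser in this module follows the same contract.  A
# successful call returns a nested tuple, e.g. (values here are hypothetical):
#   (True, ('13712345678', ('某某公司', '联系电话13712345678……', 'http://example.com/page')))
# and a failed call returns (False, "").  The baidu and sogou variants may
# also return (True, (mobile, title0)), where title0 is only a marker string.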
def extractdomain_baidu(url):
    """Extract the domain from a URL, for baidu."""
    L = url.split("/")
    if url.startswith("https://") or url.startswith("http://"):
        domain = L[2]
    else:
        domain = L[0]
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfrombaidu(mobile, html):
    """Parse the result page of Baidu (http://www.baidu.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    # Check whether the number has been flagged by Baidu Mobile Guard
    baiduguard = soup.find_all('div', {'class': 'op_fraudphone_word'})
    title0 = ""
    if baiduguard:  # the flag exists
        title0 = baiduguard[0].find_all('strong')[0].text
        title0 = title0.strip("\"")
    result = soup.find_all('div', {'class': 'result c-container '})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        h3tag = item.find_all('h3', {'class': 't'})[0]
        title = h3tag.text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        sourceurl = item.find_all('a', {'class': 'c-showurl'})[0].text
        domain = extractdomain_baidu(sourceurl)
        if domain in blacklist:
            continue
        brieftag = item.find_all('div', {'class': 'c-abstract'})
        brief = brieftag[0].text
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue  # the snippet does not mention the number, skip
        if title0 not in title:
            title = title0 + ":" + title
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        if title0 == "":
            return (False, "")
        else:
            return (True, (mobile, title0))


def extractdomain_sogou(url):
    """Extract the domain from a URL, for sogou."""
    if "翻译此页" in url:
        url = url.split("翻译此页")[0]
    L = url.split("...")
    url = L[0]
    if "/" in url:
        url = url.split("/")[0]
    if "-" in url:
        url = url.split("-")[-1]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extracesogouhaomatong(html):
    """Grab the Haomatong (号码通) info shown on a sogou result page."""
    号码通抓取正则 = re.compile(r"var queryphoneinfo = '(.+)'\.replace")
    ret = re.findall(号码通抓取正则, html)
    if ret:
        company = ret[0]
        return company.split(":")[1]
    return ""
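# Sogou shows source URLs in a truncated, decorated form, which is why
# extractdomain_sogou above is so convoluted.  Hypothetical examples of the
# inputs it copes with:
#   extractdomain_sogou("www.ip138.com/mobile...")         -> "ip138.com"
#   extractdomain_sogou("www.example.com/page - 翻译此页")  -> "example.com"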
def extractfromsogou(mobile, html):
    """Parse the result page of Sogou (http://www.sogou.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    title0 = extracesogouhaomatong(html)  # check for a Sogou Haomatong user tag
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'vrwrap'})
    availableitems = []
    for i in range(1, len(result)):
        item = result[i]
        # 0: title
        h3tag = item.find_all('h3', {'class': 'vrTitle'})[0]
        title = h3tag.text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        sourceurl = item.find_all('cite')[0].text
        domain = extractdomain_sogou(sourceurl)
        if domain in blacklist:
            continue
        brieftag = item.find_all('p', {'class': 'str_info'})
        if not brieftag:  # empty means the result only has a text description
            brieftag = item.find_all('li', {'class': 'str-text-info'})
        if not brieftag:
            continue
        brief = brieftag[0].text
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue  # the snippet does not mention the number, skip
        if title0 not in title:
            title = title0 + ":" + title
        availableitems.append((title, brief, sourceurl))
    # Search again with the old tags: legacy sogou results use div class="rb"
    if not availableitems:
        result = soup.find_all('div', {'class': 'rb'})
        for i in range(0, len(result)):
            item = result[i]
            h3tag = item.find_all('h3', {'class': 'pt'})
            if h3tag == []:
                continue
            title = h3tag[0].text
            if not containchinese(title):
                continue  # no Chinese characters in the title, skip
            title = puretitle(title)
            # Find the description snippet
            brieftag = item.find_all("div", {'class': 'ft'})
            if brieftag == []:
                continue
            brief = brieftag[0].text
            rs = re.findall(手机号正则, brief)
            # Three or more phone numbers: treat as a directory page, skip
            if len(rs) >= 3:
                continue
            if mobile not in brief:
                continue  # the snippet does not mention the number, skip
            # Find the source URL
            sourceurltag = item.find_all("div", {'class': 'fb'})
            if sourceurltag == []:
                continue
            sourceurl = sourceurltag[0].text
            domain = extractdomain_sogou(sourceurl)
            if domain in blacklist:
                continue
            if title0 not in title:
                title = title0 + ":" + title
            availableitems.append((title, brief, sourceurl))
            break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        if title0 == "":
            return (False, "")
        else:
            return (True, (mobile, title0))


def extractdomain_webcrawler(url):
    """Extract the domain from a URL, for webcrawler."""
    url = url.split("/")[0]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromwebcrawler(mobile, html):
    """Parse the result page of WebCrawler (http://www.webcrawler.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'result'})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        titletag = item.find_all('a', {'class': 'title'})
        if titletag == []:
            continue
        title = titletag[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        # Source URL
        sourceurltag = item.find_all('span', {'class': 'url'})
        if sourceurltag == []:
            continue
        sourceurl = sourceurltag[0].text
        domain = extractdomain_webcrawler(sourceurl)
        if domain in blacklist:
            continue
        # Description snippet
        brieftag = item.find_all('span')
        if brieftag == []:
            continue
        brief = brieftag[1].text.strip()
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")


def extractdomain_search(url):
    """Extract the domain from a URL, for search.com."""
    linksep = url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url = linksep[2]
    else:
        url = linksep[0]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromsearch(mobile, html):
    """Parse the result page of search.com (https://www.search.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'web-result'})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        titletag = item.find_all('a', {'class': 'web-result-title-link'})
        if titletag == []:
            continue
        title = titletag[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        # Source URL
        sourceurltag = item.find_all('p', {'class': 'web-result-url'})
        if sourceurltag == []:
            continue
        sourceurl = sourceurltag[0].text
        domain = extractdomain_search(sourceurl)
        if domain in blacklist:
            continue
        # Description snippet
        brieftag = item.find_all('p', {'class': 'web-result-description'})
        if brieftag == []:
            continue
        brief = brieftag[0].text.strip()
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")
def extractdomain_dogpile(url):
    """Extract the domain from a URL, for dogpile.com."""
    linksep = url.split("/")
    url = linksep[0]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromdogpile(mobile, html):
    """Parse the result page of Dogpile (https://www.dogpile.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'searchResult webResult'})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        titletag = item.find_all('a', {'class': 'resultTitle'})
        if titletag == []:
            continue
        title = titletag[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        # Source URL
        sourceurltag = item.find_all('a', {'class': 'resultDisplayUrl'})
        if sourceurltag == []:
            continue
        sourceurl = sourceurltag[0].text
        domain = extractdomain_dogpile(sourceurl)
        if domain in blacklist:
            continue
        # Description snippet
        brieftag = item.find_all('div', {'class': 'resultDescription'})
        if brieftag == []:
            continue
        brief = brieftag[0].text.strip()
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")


def extractdomain_ecosia(url):
    """Extract the domain from a URL, for ecosia.org."""
    linksep = url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url = linksep[2]
    else:
        url = linksep[0]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromecosia(mobile, html):
    """Parse the result page of Ecosia (https://www.ecosia.org/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'result js-result card-mobile'})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        titletag = item.find_all('a', {'class': 'result-title js-result-title'})
        if titletag == []:
            continue
        title = titletag[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        # Source URL
        sourceurltag = item.find_all('a', {'class': 'result-url js-result-url'})
        if sourceurltag == []:
            continue
        sourceurl = sourceurltag[0].text
        domain = extractdomain_ecosia(sourceurl)
        if domain in blacklist:
            continue
        # Description snippet
        brieftag = item.find_all('p', {'class': 'result-snippet'})
        if brieftag == []:
            continue
        brief = brieftag[0].text.strip()
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")
def extractdomain_qwant(url):
    """Extract the domain from a URL, for qwant.com."""
    linksep = url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url = linksep[2]
    else:
        url = linksep[0]
    domain = url
    words = domain.split(".")
    domain = words[-2] + "." + words[-1]
    return domain


def extractfromqwant(mobile, html):
    """Parse the result page of Qwant Lite (https://lite.qwant.com/).
    Return (True, useful_data) on success, otherwise (False, '')."""
    global 手机号正则, blacklist
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('div', {'class': 'result'})
    availableitems = []
    for i in range(0, len(result)):
        item = result[i]
        # 0: title
        titletag = item.find_all('a')
        if titletag == []:
            continue
        title = titletag[0].text
        if not containchinese(title):
            continue  # no Chinese characters in the title, skip
        title = puretitle(title)
        # Source URL
        sourceurltag = item.find_all('p', {'class': 'url'})
        if sourceurltag == []:
            continue
        sourceurl = sourceurltag[0].text
        domain = extractdomain_qwant(sourceurl)
        if domain in blacklist:
            continue
        # Description snippet
        brieftag = item.find_all('p')
        if brieftag == []:
            continue
        brief = brieftag[1].text.strip()
        rs = re.findall(手机号正则, brief)
        # Three or more phone numbers: treat as a directory page, skip
        if len(rs) >= 3:
            continue
        if mobile not in brief:
            continue
        availableitems.append((title, brief, sourceurl))
        break
    if availableitems:
        return (True, (mobile, availableitems[0]))
    else:
        return (False, "")


def extractmobileinfo(se, mobile, html):
    """se may be 'bing', 'chinaso', '360', 'baidu', 'sogou', 'webcrawler',
    'search', 'dogpile', 'ecosia' or 'qwant'.
    Returns (True, (mobile, (title, brief, sourceurl))) or (False, "")."""
    ret = (False, "")  # default for an unrecognized se value
    if se == 'bing':
        ret = extractfrombing(mobile, html)
    if se == 'chinaso':
        ret = extractfromchinaso(mobile, html)
    if se == '360':
        ret = extractfrom360(mobile, html)
    if se == 'baidu':
        ret = extractfrombaidu(mobile, html)
    if se == 'sogou':
        ret = extractfromsogou(mobile, html)
    if se == 'webcrawler':
        ret = extractfromwebcrawler(mobile, html)
    if se == 'search':
        ret = extractfromsearch(mobile, html)
    if se == 'dogpile':
        ret = extractfromdogpile(mobile, html)
    if se == 'ecosia':
        ret = extractfromecosia(mobile, html)
    if se == 'qwant':
        ret = extractfromqwant(mobile, html)
    return ret


if __name__ == '__main__':
    pass
    # Disabled test code:
    """
    se = '360'
    mobilehead = '137146809'
    for i in range(83, 100):
        mobile = mobilehead + str(i)
        url = "https://www.so.com/s?q=" + mobile
        print("Searching: " + url)
        html = geturltext(url)  # fetch the page source
        f = open("C:\\" + se + "_" + mobile + ".html", mode='w', encoding='utf-8')
        f.write(html)
        f.close()
        print(extractmobileinfo(se, mobile, html))
        print()
    """
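If you want to give the module a spin, here is a minimal driver sketch in the spirit of the disabled test code above. It assumes the file is saved as extract_info.py (a made-up name), that scrapy is on the PATH, and it uses an invented phone number; keep in mind that the CSS classes the parsers look for matched the engines' markup when this was written and have very likely changed since.

# Hypothetical driver (module name and phone number are made up)
from extract_info import geturltext, extractmobileinfo

se = '360'
mobile = '13712345678'  # invented number, for illustration only
url = 'https://www.so.com/s?q=' + mobile
html = geturltext(url)  # runs "scrapy fetch"; falls back to urllib/requests
ok, data = extractmobileinfo(se, mobile, html)
if ok:
    print(data)  # (mobile, (title, brief, sourceurl)), or (mobile, title0) for baidu/sogou
else:
    print('no usable result for', mobile)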