# 这是我两年前写的一个程序。有需要的可以拿去分析。
# (A program I wrote two years ago -- feel free to take it apart and study it.)
"""从搜索引擎查询结果返回的第一页html提取信息,本项目的核心程序。"""
import os
import re
import subprocess
import urllib
from random import choice
from time import time
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
# Regex for mainland-China mobile numbers: 11 digits starting 13x/14x/15x/17x/18x.
# NOTE(review): newer prefixes (166, 19x, ...) are not matched -- confirm whether
# that is still acceptable before reuse.
手机号正则=re.compile(r"1[34578]\d{9}")

# Domains of bulk number-lookup / spam-report sites whose search hits carry no
# useful information about a number's owner; results from them are skipped.
# Stored as a frozenset for O(1) membership tests (the original list held
# duplicates such as '139018.com' and 'beyaiw.com'; the set removes them).
# NOTE(review): entries with 3+ labels (e.g. 'shouji.360.cn') can never match,
# because every extractdomain_* helper returns only the last two labels.
blacklist=frozenset([
    'baishicha.com', 'dianping.com', '135130.com', 'jiweixin168.com',
    '0595lh.com', 'chahaoyi.com', 'soudianhua.com', 'ip138.com',
    'hiphop8.com', 'gpxz.com', 'sohao.org', 'shouji.360.cn', '911cha.com',
    'shoujichahao.com', 'mv188.com', 'juzhao.net', 'qqdna.com', 'hdjr.org',
    'quanzh.com', 'chahaoba.com', 'guisd.com', 'qzbst.net', 'qeayer.com',
    'chazidian.com', 'sjhcx.com', 'jihaoba.com', '2dianying.net',
    '0597zp.com', 'supfree.net', 'xysjk.com', 'beyaiw.com', 'sjgsd.com',
    'chaohaoyi.com', '03412.com', 'gdhaoma.com', 'haomadi.com', 'qq5533.com',
    'guoguo-app.com', '9om.com', 'xingming.net', 'tongmengguo.com',
    '51so.cc', '27txt.com', '51hao.cc', 'dyndns.tv', 'tool.la',
    '17012345.com', 'phone001.com', 'dunkun.com', 'chaxuntu.com',
    'chinxm.com', 'niukaba.com', 'tiaohao.com', 'sojkan.com',
    'shoujigenzong.com', '139018.com', 'chincm.com', 'labelbao.com',
    'ashercall.info', 'fpsjk.org', '13131313131.com', 'rrsms.com',
    'search-phone-owner.com', 'netrui.com', '82029.com', 'wiki-numbers.ru',
    'who-called.biz', 'haomazhuanjia.com', 'mshaoma.com', '100170.net',
    'adorkablepet.net', 'xgfc.net', 'aviciifans.com', '135cha.com',
    'tiantianxieye.com', 'yenwoop.com', 'chaxunw.com', 'aiqzu.net',
    'fzrc.org', 'herowind.com',
])
def 随机生成文件名():
    """Return a throwaway ``.html`` file name.

    Joins 10 random lowercase letters/digits with the fractional part of the
    current timestamp, making concurrent collisions very unlikely.
    (The original character-set literal listed '0' twice, slightly skewing
    the random distribution; fixed here.)
    """
    字符集='abcdefghijklmnopqrstuvwxyz0123456789'
    filename="".join([choice(字符集) for _ in range(10)])
    s=str(time())
    s=s.split(".")[-1]  # fractional-second digits of the timestamp
    return filename + s + ".html"
def geturltext3(url):
    """Fetch *url* with requests and return the body decoded as UTF-8 text.

    Other useful response fields, if ever needed: ``resp.raw``,
    ``resp.content``, ``resp.json`` (``resp.status_code == 200`` means OK).
    """
    response = requests.get(url, timeout=5, verify=True)
    response.encoding = 'utf-8'
    return response.text
def geturltext2(url):
    """Fetch *url* via urllib and return the page source as text.

    Sends browser-like headers.  On any failure falls back to
    :func:`geturltext3`, the requests-based fetcher.

    (Rewritten from the deprecated ``urllib.request.URLopener`` to
    ``Request`` + ``urlopen``; the bare ``except:`` is narrowed to
    ``except Exception`` so KeyboardInterrupt/SystemExit still propagate.)
    """
    content=""
    htmlcodestyle='utf-8'  # change if the target page uses another encoding
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;',
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        with urlopen(request) as f:
            content = f.read().decode(htmlcodestyle)
    except Exception:  # any failure: fall back to the requests-based fetch
        content = geturltext3(url)
    return content
def geturltext(url):
    """Fetch *url* by shelling out to ``scrapy fetch`` and return the HTML.

    The original concatenated *url* into an ``os.popen`` shell command
    (shell-injection risk) and round-tripped through a temporary file; this
    version passes an argument list (no shell) and captures stdout directly.

    Falls back to :func:`geturltext2` when the response looks abnormal:
    shorter than 1000 characters, or containing webcrawler's captcha page
    marker ``<div class="captcha-container">``.
    """
    try:
        completed = subprocess.run(["scrapy", "fetch", url, "--nolog"],
                                   stdout=subprocess.PIPE)
        html = completed.stdout.decode(encoding='utf-8')
    except (OSError, UnicodeDecodeError):
        # scrapy missing or emitted non-UTF-8: treat as an abnormal result
        html = ""
    if len(html) < 1000 or "captcha-container" in html:  # result looks broken
        html = geturltext2(url)
    return html
def puretitle(title):
    """Trim a search-result title to the part before the first site-name
    separator ("–", "_", "-" or "|"), then strip surrounding whitespace."""
    for separator in ("–", "_", "-", "|"):
        title = title.split(separator)[0]
    return title.strip()
def is_chinese(uchar):
    """Return True when the single character *uchar* lies in the CJK
    Unified Ideographs range U+4E00..U+9FA5."""
    return u'\u4e00' <= uchar <= u'\u9fa5'
def containchinese(astring):
    """Return True when at least one character of *astring* is a Chinese
    ideograph (CJK Unified Ideographs U+4E00..U+9FA5)."""
    return any(u'\u4e00' <= character <= u'\u9fa5' for character in astring)
def extractdomain_bing(url):
    """Extract a bare ``domain.tld`` from a Bing result URL.

    Accepts both full URLs ("https://host/path") and scheme-less display
    URLs ("host/path").  Only the last two dot-separated labels are kept,
    so multi-part suffixes like ``example.com.cn`` come back as ``com.cn``.
    (Hardened: a single-label host such as "localhost" is returned as-is
    instead of raising IndexError.)
    """
    parts = url.split("/")
    # With a scheme the host is the third "/"-separated piece, else the first.
    domain = parts[2] if url.startswith(("https://", "http://")) else parts[0]
    words = domain.split(".")
    if len(words) < 2:
        return domain  # single-label host: nothing to trim
    return words[-2] + "." + words[-1]
def extractfrombing(mobile,html):
    """Parse the first page of Bing results for phone number *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable organic result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('li',{'class':'b_algo'})  # one <li class="b_algo"> per organic result
    availableitems=[]
    for item in result:
        # 0: title
        title=item.find_all('h2')[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        sourceurl=item.find_all('cite')[0].text
        domain=extractdomain_bing(sourceurl)
        if domain in blacklist:
            continue
        brief=item.find_all('p')[0].text
        # Heuristic: 3+ phone numbers in the snippet means a bulk number-lookup
        # page with no real information about this particular number.
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # the snippet must actually mention the queried number
        availableitems.append((title,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_chinaso(url):
    """Extract ``domain.tld`` from a ChinaSo snapshot/display URL.

    ChinaSo shows either a URL truncated with "..." or a snapshot line whose
    fields are separated by non-breaking spaces (U+00A0), the second-to-last
    field being the URL.
    NOTE(review): raises IndexError when the text contains neither "..." nor
    a second NBSP-separated field -- presumably real result pages always have
    one of the two; confirm before reuse.
    """
    if "..." in url:
        domain=url.split("...")[0]
    else:
        L=url.split("\xa0")  # fields separated by non-breaking spaces
        url=L[-2]            # second-to-last field holds the URL
        L=url.split("/")
        domain=L[0]
        if url.startswith("https://") or url.startswith("http://"):domain=L[2]
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain
def extractfromchinaso(mobile,html):
    """Parse the first ChinaSo (http://www.chinaso.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('li',{'class':'reItem '})  # note the trailing space in the class name
    availableitems=[]
    for i in range(1,len(result)):  # index 0 skipped -- presumably an ad/header item; verify
        item=result[i]
        # 0: title
        title=item.find_all('h2')[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        sourceurl=item.find_all('p',{'class':'snapshot'})[0].text
        domain=extractdomain_chinaso(sourceurl)
        if domain in blacklist:
            continue
        brief=item.find_all('p')[0].text
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_360(url):
    """Extract ``domain.tld`` from a 360 Search result link.

    Expects a full URL with scheme (the host is taken as the third
    "/"-separated piece); only the last two dot-separated labels are kept.
    """
    host = url.split("/")[2]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfrom360(mobile,html):
    """Parse the first 360 Search (http://www.so.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('li',{'class':'res-list'})
    availableitems=[]
    for i in range(1,len(result)):  # index 0 skipped -- presumably a promoted item; verify
        item=result[i]
        # 0: title -- the h3 also contains the link to the source page
        h3tag=item.find_all('h3',{'class':'res-title'})[0]
        title=h3tag.text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        linkdata=h3tag.find_all('a')[0]
        # 360 stores the real target in data-url when href is a redirect link
        if linkdata.has_attr('data-url'):
            sourceurl=linkdata['data-url']
        else:
            sourceurl=linkdata['href']
        domain=extractdomain_360(sourceurl)
        if domain in blacklist:
            continue
        # The snippet lives in one of several layouts; try them in turn.
        brieftag=item.find_all('p',{'class':'res-desc'})
        if len(brieftag)>0:
            brief=brieftag[0].text
        else:
            brieftag=item.find_all('div',{'class':'res-rich res-realty clearfix'})
            if len(brieftag)>0:
                brief=brieftag[0].text
            else:
                brieftag=item.find_all('p')
                if len(brieftag)>2:
                    brief=brieftag[1].text
                else:
                    continue
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_baidu(url):
    """Extract ``domain.tld`` from a Baidu display URL, which may or may not
    carry a scheme; only the last two dot-separated labels are kept."""
    pieces = url.split("/")
    if url.startswith("https://") or url.startswith("http://"):
        host = pieces[2]
    else:
        host = pieces[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfrombaidu(mobile,html):
    """Parse the first Baidu (http://www.baidu.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result; when only a Baidu Mobile Guard fraud label was found,
    returns ``(True, (mobile, label))``; otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    # Check whether Baidu Mobile Guard has tagged this number (spam/fraud label)
    baiduguard=soup.find_all('div',{'class':'op_fraudphone_word'})
    title0=""
    if baiduguard:  # non-empty: a guard label is present
        title0=baiduguard[0].find_all('strong')[0].text
        title0=title0.strip("\"")
    result=soup.find_all('div',{'class':'result c-container '})  # note the trailing space
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title
        h3tag=item.find_all('h3',{'class':'t'})[0]
        title=h3tag.text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        sourceurl=item.find_all('a',{'class':'c-showurl'})[0].text
        domain=extractdomain_baidu(sourceurl)
        if domain in blacklist:
            continue
        brieftag=item.find_all('div',{'class':'c-abstract'})
        brief=brieftag[0].text
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        # Prefix the guard label unless the title already contains it
        # (when title0 is "" the containment check is always true, so nothing is prefixed).
        if not( title0 in title):
            title=title0 + ":" + title
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        if title0=="":
            return (False,"")
        else:
            return (True,(mobile,title0))
def extractdomain_sogou(url):
    """Extract ``domain.tld`` from a Sogou display URL.

    Sogou display URLs may carry a trailing "翻译此页" (translate-this-page)
    marker, a "..." truncation, a path, or a leading "prefix-" chunk; all are
    stripped before the last two dot-separated labels are returned.
    """
    if "翻译此页" in url:
        url = url.split("翻译此页")[0]
    url = url.split("...")[0]
    if "/" in url:
        url = url.split("/")[0]
    if "-" in url:
        url = url.split("-")[-1]
    labels = url.split(".")
    return labels[-2] + "." + labels[-1]
def extracesogouhaomatong(html):
    """Pull the Sogou Haomatong ("number guard") label out of raw page source.

    The page embeds ``var queryphoneinfo = '<tag>:<label>'.replace``; return
    the label part after the colon, or "" when the marker is absent.
    """
    matches = re.findall(r"var queryphoneinfo = '(.+)'\.replace", html)
    if not matches:
        return ""
    return matches[0].split(":")[1]
def extractfromsogou(mobile,html):
    """Parse the first Sogou (http://www.sogou.com/) results page for *mobile*.

    Tries the current result markup (div.vrwrap) first, then the legacy
    markup (div.rb).  Returns ``(True, (mobile, (title, brief, sourceurl)))``
    for the first acceptable result; when only a Haomatong crowd label was
    found, returns ``(True, (mobile, label))``; otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    title0=extracesogouhaomatong(html)  # crowd-sourced Haomatong label, "" when absent
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'vrwrap'})
    availableitems=[]
    for i in range(1,len(result)):  # index 0 skipped -- presumably a promoted item; verify
        item=result[i]
        # 0: title
        h3tag=item.find_all('h3',{'class':'vrTitle'})[0]
        title=h3tag.text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        sourceurl=item.find_all('cite')[0].text
        domain=extractdomain_sogou(sourceurl)
        if domain in blacklist:
            continue
        brieftag=item.find_all('p',{'class':'str_info'})
        if not brieftag:  # empty: the snippet is a plain text-info list instead
            brieftag=item.find_all('li',{'class':'str-text-info'})
        if not brieftag:continue
        brief=brieftag[0].text
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        if not( title0 in title):
            title=title0 + ":" + title
        availableitems.append((title ,brief,sourceurl))
        # NOTE(review): unlike the sibling extractors there is no `break` here,
        # so the remaining vrwrap results are still scanned even though only
        # availableitems[0] is ever used -- wasted work, same final result.
    # Second pass: the legacy Sogou markup uses div class="rb"
    if not availableitems:
        result=soup.find_all('div',{'class':'rb'})
        for i in range(0,len(result)):
            item=result[i]
            h3tag=item.find_all('h3',{'class':'pt'})
            if h3tag==[]:continue
            title=h3tag[0].text
            if not containchinese(title):continue  # skip titles without any Chinese characters
            title=puretitle(title)
            # snippet/description
            brieftag=item.find_all("div",{'class':'ft'})
            if brieftag==[]:continue
            brief=brieftag[0].text
            # 3+ phone numbers in the snippet => bulk lookup page, no real info
            rs=re.findall(手机号正则,brief)
            if len(rs)>=3:continue
            if not mobile in brief:continue  # snippet must mention the queried number
            # source URL
            sourceurltag=item.find_all("div",{'class':'fb'})
            if sourceurltag==[]:continue
            sourceurl=sourceurltag[0].text
            domain=extractdomain_sogou(sourceurl)
            if domain in blacklist:continue
            if not( title0 in title):
                title=title0 + ":" + title
            availableitems.append((title ,brief,sourceurl))
            break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        if title0=="":
            return (False,"")
        else:
            return (True,(mobile,title0))
def extractdomain_webcrawler(url):
    """Extract ``domain.tld`` from a WebCrawler display URL (no scheme
    expected); only the last two dot-separated labels are kept."""
    host = url.split("/")[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfromwebcrawler(mobile,html):
    """Parse the first WebCrawler (http://www.webcrawler.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'result'})
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title
        titletag=item.find_all('a',{'class':'title'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('span',{'class':'url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_webcrawler(sourceurl)
        if domain in blacklist:continue
        # snippet/description -- assumed to be the second <span>; NOTE(review):
        # only emptiness is checked before indexing [1], verify against markup
        brieftag=item.find_all('span')
        if brieftag==[]:continue
        brief=brieftag[1].text.strip()
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_search(url):
    """Extract ``domain.tld`` from a search.com display URL, which may or may
    not carry a scheme; only the last two dot-separated labels are kept."""
    segments = url.split("/")
    has_scheme = url.startswith("http://") or url.startswith("https://")
    host = segments[2] if has_scheme else segments[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfromsearch(mobile,html):
    """Parse the first search.com (https://www.search.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'web-result'})
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title
        titletag=item.find_all('a',{'class':'web-result-title-link'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('p',{'class':'web-result-url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_search(sourceurl)
        if domain in blacklist:continue
        # snippet/description
        brieftag=item.find_all('p',{'class':'web-result-description'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_dogpile(url):
    """Extract ``domain.tld`` from a Dogpile display URL (no scheme expected);
    only the last two dot-separated labels are kept."""
    host = url.split("/")[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfromdogpile(mobile,html):
    """Parse the first Dogpile (https://www.dogpile.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'searchResult webResult'})
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title
        titletag=item.find_all('a',{'class':'resultTitle'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('a',{'class':'resultDisplayUrl'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_dogpile(sourceurl)
        if domain in blacklist:continue
        # snippet/description
        brieftag=item.find_all('div',{'class':'resultDescription'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_ecosia(url):
    """Extract ``domain.tld`` from an Ecosia display URL, which may or may not
    carry a scheme; only the last two dot-separated labels are kept."""
    segments = url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        host = segments[2]
    else:
        host = segments[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfromecosia(mobile,html):
    """Parse the first Ecosia (https://www.ecosia.org/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'result js-result card-mobile'})
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title
        titletag=item.find_all('a',{'class':'result-title js-result-title'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('a',{'class':'result-url js-result-url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_ecosia(sourceurl)
        if domain in blacklist:continue
        # snippet/description
        brieftag=item.find_all('p',{'class':'result-snippet'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractdomain_qwant(url):
    """Extract ``domain.tld`` from a Qwant display URL, which may or may not
    carry a scheme; only the last two dot-separated labels are kept."""
    segments = url.split("/")
    host = segments[2] if url.startswith(("http://", "https://")) else segments[0]
    labels = host.split(".")
    return labels[-2] + "." + labels[-1]
def extractfromqwant(mobile,html):
    """Parse the first Qwant Lite (https://lite.qwant.com/) results page for *mobile*.

    Returns ``(True, (mobile, (title, brief, sourceurl)))`` for the first
    acceptable result, otherwise ``(False, '')``.
    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('div',{'class':'result'})
    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]
        # 0: title -- the first <a> in the result block
        titletag=item.find_all('a')
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue  # skip titles without any Chinese characters
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('p',{'class':'url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_qwant(sourceurl)
        if domain in blacklist:continue
        # snippet/description -- assumed to be the second <p>; NOTE(review):
        # only emptiness is checked before indexing [1], so a result with a
        # single <p> would raise IndexError here -- verify against markup
        brieftag=item.find_all('p')
        if brieftag==[]:continue
        brief=brieftag[1].text.strip()
        # 3+ phone numbers in the snippet => bulk lookup page, no real info
        rs=re.findall(手机号正则,brief)
        if len(rs)>=3:continue
        if not mobile in brief:continue  # snippet must mention the queried number
        availableitems.append((title ,brief,sourceurl))
        break  # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
def extractmobileinfo(se,mobile,html):
    """Dispatch *html* to the extractor matching search engine *se*.

    *se* is one of 'bing', 'chinaso', '360', 'baidu', 'sogou', 'webcrawler',
    'search', 'dogpile', 'ecosia', 'qwant'.
    Returns ``(True, (mobile, (title, brief, sourceurl)))`` on success and
    ``(False, "")`` on failure.  An unrecognized *se* now also yields
    ``(False, "")`` -- the original if-chain left ``ret`` unassigned and
    raised UnboundLocalError in that case.
    """
    extractors = {
        'bing': extractfrombing,
        'chinaso': extractfromchinaso,
        '360': extractfrom360,
        'baidu': extractfrombaidu,
        'sogou': extractfromsogou,
        'webcrawler': extractfromwebcrawler,
        'search': extractfromsearch,
        'dogpile': extractfromdogpile,
        'ecosia': extractfromecosia,
        'qwant': extractfromqwant,
    }
    extractor = extractors.get(se)
    if extractor is None:
        return (False, "")
    return extractor(mobile, html)
if __name__=='__main__':
    # Nothing runs on direct execution; the string below is a disabled example.
    pass
# The triple-quoted string is commented-out manual smoke-test code: it queried
# 360 Search for a range of numbers, saved each result page under C:\ and
# printed the extracted info.  Kept verbatim for reference.
"""
se='360'
mobilehead='137146809'
for i in range(83,100):
mobile=mobilehead + str(i)
url="https://www.so.com/s?q=" + mobile
print("搜索:" + url )
html=geturltext(url) #得到源码
f=open("C:\\"+ se + "_" + mobile + ".html",mode='w',encoding='utf-8')
f.write(html)
f.close()
print(extractmobileinfo(se,mobile,html))
print()
"""
