A Program for Analyzing the First Page of Results from Multiple Search Engines

This is a program I wrote two years ago. Feel free to take it and study it if you find it useful.
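
To run it you will need the third-party packages the code imports (this list is inferred from the import statements; exact versions are not given in the original): requests, beautifulsoup4, lxml (the parser BeautifulSoup is asked to use), and the scrapy command-line tool for the primary fetcher.

pip install requests beautifulsoup4 lxml scrapy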

"""从搜索引擎查询结果返回的第一页html提取信息,本项目的核心程序。"""
import os
import re
from random import choice
from time import time
import urllib.request

import requests
from bs4 import BeautifulSoup

手机号正则=re.compile(r"1[34578]\d{9}")    # 11-digit mainland-China mobile numbers (the prefixes current when this was written)
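# A quick sanity check of the pattern (the number below is a made-up example):
#   >>> re.findall(手机号正则, "机主13712345678曾被标记")
#   ['13712345678']
# Prefixes introduced later (166, 19x, ...) are not covered by 1[34578].
# Domains of phone-lookup / number-marking sites whose results we skip: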
blacklist=['baishicha.com','dianping.com','135130.com', 'jiweixin168.com', '0595lh.com', 'chahaoyi.com', 'soudianhua.com', 'ip138.com', 'hiphop8.com', 'gpxz.com', 'sohao.org', 'shouji.360.cn', '911cha.com', 'shoujichahao.com', 'mv188.com', 'juzhao.net', 'qqdna.com', 'hdjr.org', 'quanzh.com', 'chahaoba.com', 'guisd.com', 'qzbst.net', 'qeayer.com', 'chazidian.com', 'sjhcx.com', 'jihaoba.com', '2dianying.net', '0597zp.com', 'supfree.net', 'xysjk.com', 'beyaiw.com', 'sjgsd.com', 'chaohaoyi.com', '03412.com', 'gdhaoma.com', 'haomadi.com', 'qq5533.com', 'guoguo-app.com', '9om.com', 'xingming.net', 'tongmengguo.com', '51so.cc', '27txt.com', '51hao.cc', 'dyndns.tv', 'tool.la', '17012345.com', 'phone001.com', 'dunkun.com', 'chaxuntu.com', 'chinxm.com', 'niukaba.com', 'tiaohao.com', 'sojkan.com', 'shoujigenzong.com', '139018.com', 'chincm.com', 'labelbao.com', 'ashercall.info', 'fpsjk.org', '13131313131.com', 'rrsms.com', '139018.com', 'search-phone-owner.com', 'netrui.com', '82029.com', 'wiki-numbers.ru', 'who-called.biz', 'haomazhuanjia.com', 'mshaoma.com', '100170.net', 'beyaiw.com', 'adorkablepet.net', 'xgfc.net', 'aviciifans.com', '135cha.com', 'tiantianxieye.com', 'yenwoop.com', 'chaxunw.com', 'aiqzu.net', 'fzrc.org', 'herowind.com']
def 随机生成文件名():
    """Generate a random throwaway .html file name."""
    字符集='abcdefghijklmnopqrstuvwxyz0123456789'
    filename="".join([choice(字符集) for _ in range(10)])
    s=str(time())
    s=s.split(".")[-1]              # fractional part of the current timestamp, for extra uniqueness
    return filename + s + ".html"

def geturltext3(url):
    """Fetch the page with requests (the last-resort fetcher)."""
    resp = requests.get(url, timeout=5, verify=True)    # resp.raw, resp.content, resp.text and resp.json() are all available
    #print(resp.status_code)                            # 200 means OK
    resp.encoding='utf-8'
    content=resp.text
    return content

def geturltext2(url):
    """Fetch the page source as text via urllib's URLopener (deprecated in modern Python, kept as in the original)."""
    content=""
    htmlcodestyle='utf-8'                   # change to match the page's encoding
    网页打开器 = urllib.request.URLopener()
    网页打开器.addheaders = []
    网页打开器.addheader('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36')
    网页打开器.addheader('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
    网页打开器.addheader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;')
    #print("Opening URL: " + url)
    try:
        f = 网页打开器.open(url)
        content = f.read().decode(htmlcodestyle)
        f.close()
    except Exception:                       # on any failure, fall back to fetching with requests
        content= geturltext3(url)
    return content

def geturltext(url):
    """Fetch a page by shelling out to `scrapy fetch`; fall back to geturltext2 if the result looks wrong."""
    filename= 随机生成文件名()
    yy=os.popen("scrapy fetch " + url + " --nolog >> " + filename)  # runs without popping up a console window
    yy.close()  # wait for the command above to finish
    f=open(filename,mode='rb')
    html=f.read()
    f.close()
    os.remove(filename)
    html=html.decode(encoding='utf-8')
    # Special handling for webcrawler: it may serve a captcha or ask you to take a survey.
    # Such pages contain the tag <div class="captcha-container">.
    if len(html)<1000 or "captcha-container" in html:       # treat the response as abnormal
        html=geturltext2(url)
    return html



def puretitle(title):
    """Keep only the part of the title before the first separator ("–", "_", "-" or "|")."""
    title=title.split("–")[0]
    title=title.split("_")[0]
    title=title.split("-")[0]
    title=title.split("|")[0]
    title=title.strip()
    return title
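# For example (hypothetical title text):
#   >>> puretitle("13712345678是谁的号码_百度知道")
#   '13712345678是谁的号码'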

def is_chinese(uchar):
    """Return True if a single unicode character is a Chinese (CJK) character."""
    return u'\u4e00' <= uchar <= u'\u9fa5'

def containchinese(astring):
    """Return True if the string contains at least one Chinese character."""
    for c in astring:
        if is_chinese(c):
            return True
    return False
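# >>> containchinese("hello")
# False
# >>> containchinese("查询13712345678")
# True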

def extractdomain_bing(url):
    """Extract the domain from a URL (Bing result format)."""
    L=url.split("/")
    domain=L[0]
    if url.startswith("https://") or url.startswith("http://"):domain=L[2]
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain
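# e.g. (a hypothetical Bing <cite> value):
#   >>> extractdomain_bing("https://www.chahaoba.com/13712345678")
#   'chahaoba.com'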

def extractfrombing(mobile,html):
    """On success return the tuple (True, useful data); otherwise return (False,'')."""
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('li',{'class':'b_algo'})
    availableitems=[]
    for item in result:
        #0: title
        title=item.find_all('h2')[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)

        sourceurl=item.find_all('cite')[0].text
        domain=extractdomain_bing(sourceurl)
        if  domain in blacklist:
            continue
        brief=item.find_all('p')[0].text
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue            # skip if the snippet does not contain the number either
        availableitems.append((title,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

def extractdomain_chinaso(url):
    """Extract the domain from a URL (chinaso result format)."""
    if "..." in url:
        domain=url.split("...")[0]
    else:
        L=url.split("\xa0")
        url=L[-2]
        L=url.split("/")
        domain=L[0]
        if url.startswith("https://") or url.startswith("http://"):domain=L[2]
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain
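# e.g., assuming snapshot text where "..." follows right after the host
# (a guess at the chinaso snapshot format):
#   >>> extractdomain_chinaso("www.chahaoba.com...")
#   'chahaoba.com'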

def extractfromchinaso(mobile,html):
    """Parse the result page of a China Search (chinaso) query; on success return (True, useful data), otherwise (False,'')
           http://www.chinaso.com/    """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('li',{'class':'reItem '})

    availableitems=[]
    for i in range(1,len(result)):
        item=result[i]

        #0: title
        title=item.find_all('h2')[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)

        sourceurl=item.find_all('p',{'class':'snapshot'})[0].text

        domain=extractdomain_chinaso(sourceurl)

        if  domain in blacklist:
            continue
        brief=item.find_all('p')[0].text
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue            # skip if the snippet does not contain the number either
        availableitems.append((title,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

def extractdomain_360(url):
    """Extract the domain from a URL (360 result format)."""
    L=url.split("/")
    domain=L[2]
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfrom360(mobile,html):
    """Parse the result page of a 360 search query; on success return (True, useful data), otherwise (False,'')
          http://www.so.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    result=soup.find_all('li',{'class':'res-list'})

    availableitems=[]
    for i in range(1,len(result)):
        item=result[i]

        #0: title; this tag also carries the source URL
        h3tag=item.find_all('h3',{'class':'res-title'})[0]
        title=h3tag.text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)

        linkdata=h3tag.find_all('a')[0]

        if linkdata.has_attr('data-url'):
            sourceurl=linkdata['data-url']
        else:
            sourceurl=linkdata['href']

        domain=extractdomain_360(sourceurl)

        if  domain in blacklist:
            continue

        # the snippet can live in several different tags, so try them in turn
        brieftag=item.find_all('p',{'class':'res-desc'})
        if len(brieftag)>0:
            brief=brieftag[0].text
        else:
            brieftag=item.find_all('div',{'class':'res-rich res-realty clearfix'})
            if len(brieftag)>0:
                brief=brieftag[0].text
            else:
                brieftag=item.find_all('p')
                if len(brieftag)>2:
                    brief=brieftag[1].text
                else:
                    continue

        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue            # skip if the snippet does not contain the number either
        availableitems.append((title,brief,sourceurl))
        break                                      # one usable result is enough
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

        
def extractdomain_baidu(url):
    """Extract the domain from a URL (Baidu result format)."""
    L=url.split("/")
    if url.startswith("https://") or url.startswith("http://"):
        domain=L[2]
    else:
        domain=L[0]
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfrombaidu(mobile,html):
    """Parse the result page of a Baidu search query; on success return (True, useful data), otherwise (False,'')
          http://www.baidu.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')
    # check whether Baidu Mobile Guard has flagged the number
    baiduguard=soup.find_all('div',{'class':'op_fraudphone_word'})
    title0=""
    if baiduguard:         # not empty, so the number carries a label
        title0=baiduguard[0].find_all('strong')[0].text
        title0=title0.strip("\"")

    result=soup.find_all('div',{'class':'result c-container '})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        h3tag=item.find_all('h3',{'class':'t'})[0]
        title=h3tag.text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)

        sourceurl=item.find_all('a',{'class':'c-showurl'})[0].text

        domain=extractdomain_baidu(sourceurl)

        if  domain in blacklist:
            continue

        brieftag=item.find_all('div',{'class':'c-abstract'})
        brief=brieftag[0].text

        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue            # skip if the snippet does not contain the number either
        if not( title0 in title):
            title=title0 + ":" + title
        availableitems.append((title ,brief,sourceurl))
        break

    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        if title0=="":
            return (False,"")
        else:
            return (True,(mobile,title0))

def extractdomain_sogou(url):
    """Extract the domain from a URL (sogou result format)."""
    if "翻译此页" in url:                     # strip sogou's "translate this page" label
        url=url.split("翻译此页")[0]
    L=url.split("...")
    url=L[0]
    if "/" in url:
        url=url.split("/")[0]
    if "-" in url:
        url=url.split("-")[-1]

    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain
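# e.g., assuming a truncated sogou <cite> like the following (a guess at the format):
#   >>> extractdomain_sogou("www.chahaoba.com/haoma/137...")
#   'chahaoba.com'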
def extracesogouhaomatong(html):
    """Grab the Haomatong (号码通) label that sogou shows for a flagged number."""
    号码通抓取正则=re.compile(r"var queryphoneinfo = '(.+)'\.replace")
    ret=re.findall(号码通抓取正则,html)
    if ret:
        company=ret[0]
        return company.split(":")[1]
    return ""
def extractfromsogou(mobile,html):
    """Parse the result page of a sogou search query; on success return (True, useful data), otherwise (False,'')
          http://www.sogou.com/  """
    global 手机号正则,blacklist
    title0=extracesogouhaomatong(html)  # check whether sogou Haomatong users have labelled the number
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'vrwrap'})

    availableitems=[]
    for i in range(1,len(result)):
        item=result[i]

        #0: title
        h3tag=item.find_all('h3',{'class':'vrTitle'})[0]
        title=h3tag.text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)

        sourceurl=item.find_all('cite')[0].text

        domain=extractdomain_sogou(sourceurl)

        if  domain in blacklist:
            continue

        brieftag=item.find_all('p',{'class':'str_info'})
        if not brieftag:      # empty means the result only has a plain-text description
           brieftag=item.find_all('li',{'class':'str-text-info'})
        if not brieftag:continue

        brief=brieftag[0].text
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue            # skip if the snippet does not contain the number either
        if not( title0 in title):
            title=title0 + ":" + title
        availableitems.append((title ,brief,sourceurl))
    # search again with the old tags; old sogou results use div class="rb"
    if not availableitems:
         result=soup.find_all('div',{'class':'rb'})
         for i in range(0,len(result)):
             item=result[i]
             h3tag=item.find_all('h3',{'class':'pt'})
             if h3tag==[]:continue
             title=h3tag[0].text
             if not containchinese(title):continue                 # no Chinese characters, skip
             title=puretitle(title)
             # look for the snippet
             brieftag=item.find_all("div",{'class':'ft'})
             if brieftag==[]:continue
             brief=brieftag[0].text
             rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
             if len(rs)>=3:continue
             if not mobile in brief:continue            # skip if the snippet does not contain the number either

             # look for the source URL
             sourceurltag=item.find_all("div",{'class':'fb'})
             if sourceurltag==[]:continue
             sourceurl=sourceurltag[0].text
             domain=extractdomain_sogou(sourceurl)
             if  domain in blacklist:continue
             if not( title0 in title):
                 title=title0 + ":" + title
             availableitems.append((title ,brief,sourceurl))
             break

    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        if title0=="":
            return (False,"")
        else:
            return (True,(mobile,title0))

    
def extractdomain_webcrawler(url):
    """Extract the domain from a URL (webcrawler result format)."""
    url=url.split("/")[0]
    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfromwebcrawler(mobile,html):
    """Parse the result page of a webcrawler search query; on success return (True, useful data), otherwise (False,'')
          http://www.webcrawler.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'result'})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        titletag=item.find_all('a',{'class':'title'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('span',{'class':'url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text
        domain=extractdomain_webcrawler(sourceurl)
        if  domain in blacklist:continue
        # snippet / description: take the second <span>
        brieftag=item.find_all('span')
        if len(brieftag)<2:continue
        brief=brieftag[1].text.strip()
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue

        availableitems.append((title ,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
    

def extractdomain_search(url):
    """Extract the domain from a URL (search.com result format)."""
    linksep=url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url=linksep[2]
    else:
        url=linksep[0]
    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfromsearch(mobile,html):
    """Parse the result page of a search.com query; on success return (True, useful data), otherwise (False,'')
          https://www.search.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'web-result'})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        titletag=item.find_all('a',{'class':'web-result-title-link'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('p',{'class':'web-result-url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text

        domain=extractdomain_search(sourceurl)
        if  domain in blacklist:continue
        # snippet / description
        brieftag=item.find_all('p',{'class':'web-result-description'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue

        availableitems.append((title ,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

def extractdomain_dogpile(url):
    """Extract the domain from a URL (dogpile.com result format)."""
    linksep=url.split("/")
    url=linksep[0]

    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfromdogpile(mobile,html):
    """Parse the result page of a dogpile.com search query; on success return (True, useful data), otherwise (False,'')
          https://www.dogpile.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'searchResult webResult'})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        titletag=item.find_all('a',{'class':'resultTitle'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('a',{'class':'resultDisplayUrl'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text

        domain=extractdomain_dogpile(sourceurl)
        if  domain in blacklist:continue
        # snippet / description
        brieftag=item.find_all('div',{'class':'resultDescription'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue

        availableitems.append((title ,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")
    
def extractdomain_ecosia(url):
    """Extract the domain from a URL (ecosia result format)."""
    linksep=url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url=linksep[2]
    else:
        url=linksep[0]

    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfromecosia(mobile,html):
    """Parse the result page of an ecosia search query; on success return (True, useful data), otherwise (False,'')
       https://www.ecosia.org/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'result js-result card-mobile'})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        titletag=item.find_all('a',{'class':'result-title js-result-title'})
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('a',{'class':'result-url js-result-url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text

        domain=extractdomain_ecosia(sourceurl)
        if  domain in blacklist:continue
        # snippet / description
        brieftag=item.find_all('p',{'class':'result-snippet'})
        if brieftag==[]:continue
        brief=brieftag[0].text.strip()
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue

        availableitems.append((title ,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

def extractdomain_qwant(url):
    """Extract the domain from a URL (qwant result format)."""
    linksep=url.split("/")
    if url.startswith("http://") or url.startswith("https://"):
        url=linksep[2]
    else:
        url=linksep[0]

    domain=url
    words=domain.split(".")
    domain=words[-2] + "." + words[-1]
    return domain

def extractfromqwant(mobile,html):
    """Parse the result page of a lite.qwant.com search query; on success return (True, useful data), otherwise (False,'')
    https://lite.qwant.com/  """
    global 手机号正则,blacklist
    soup=BeautifulSoup(html,'lxml')

    result=soup.find_all('div',{'class':'result'})

    availableitems=[]
    for i in range(0,len(result)):
        item=result[i]

        #0: title
        titletag=item.find_all('a')
        if titletag==[]:continue
        title=titletag[0].text
        if not containchinese(title):continue                 # no Chinese characters, skip
        title=puretitle(title)
        # source URL
        sourceurltag=item.find_all('p',{'class':'url'})
        if sourceurltag==[]:continue
        sourceurl=sourceurltag[0].text

        domain=extractdomain_qwant(sourceurl)
        if  domain in blacklist:continue
        # snippet: take the second <p> (the first one appears to be the URL line)
        brieftag=item.find_all('p')
        if len(brieftag)<2:continue
        brief=brieftag[1].text.strip()
        rs=re.findall(手机号正则,brief)        # three or more mobile numbers in the snippet means the site has no real info about this one
        if len(rs)>=3:continue
        if not mobile in brief:continue

        availableitems.append((title ,brief,sourceurl))
        break
    if availableitems:
        return (True,(mobile,availableitems[0]))
    else:
        return (False,"")

   
def extractmobileinfo(se,mobile,html):
    """se may be 'bing','chinaso','360','baidu','sogou','webcrawler','search',
       'dogpile','ecosia' or 'qwant'.
       The result is (True,(mobile number,(title,brief,source url))) or (False,"").
    """
    ret=(False,"")                          # default, so an unknown se cannot raise NameError
    if se=='bing':
        ret=extractfrombing(mobile,html)
    elif se=='chinaso':
        ret=extractfromchinaso(mobile,html)
    elif se=='360':
        ret=extractfrom360(mobile,html)
    elif se=='baidu':
        ret=extractfrombaidu(mobile,html)
    elif se=='sogou':
        ret=extractfromsogou(mobile,html)
    elif se=='webcrawler':
        ret=extractfromwebcrawler(mobile,html)
    elif se=='search':
        ret=extractfromsearch(mobile,html)
    elif se=='dogpile':
        ret=extractfromdogpile(mobile,html)
    elif se=='ecosia':
        ret=extractfromecosia(mobile,html)
    elif se=='qwant':
        ret=extractfromqwant(mobile,html)
    return ret
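# Example return values (the contents are hypothetical):
#   (True, ('13712345678', ('某某公司', '……联系电话13712345678……', 'www.example.com/...')))
#   (False, '')
# For baidu and sogou, a guard/marker label alone may also yield (True, (mobile, label)).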

    
if __name__=='__main__':
    pass
    """
    se='360'
    mobilehead='137146809'
    for i in range(83,100):
        mobile=mobilehead + str(i)
        url="https://www.so.com/s?q=" + mobile
        print("Searching: " + url)
        html=geturltext(url)          # fetch the page source
        f=open("C:\\"+ se + "_" +  mobile + ".html",mode='w',encoding='utf-8')
        f.write(html)
        f.close()
        print(extractmobileinfo(se,mobile,html))
        print()
     """

— 李兴球