Test document code for the 神猫链接采集器 (catLinkPicker)
"""
标题:神猫链接采集器,英文名 catLinkPicker
描述:输入命令行参数为,域名 正则 初始链接列表文件名
针对某些特征网站需要具体问题具体分析,再更改代码,或用selenium翻页采集链接,效果更好.
特别是对于那些直接网址直接是以诸如 “aspx?id=xxxx”或 “php?id=yyyy”结尾的的网页就不必采集了,这种网页是直接查询数据库。
找出最小的id和最大的id,做个列表导出到文件即可。
如果需要中止while循环,新建一个文件名为'中止.txt'的空文件即可.
作者:李兴球
日期:2018/2/28
"""
from lxml import html
import requests
from time import ctime, time
from random import choice
import os
import re
import sys
from urllib.parse import urljoin   # resolves relative URLs against absolute ones
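# A minimal sketch of the id-range shortcut described in the docstring; the URL
# template, the id range, and the output filename below are assumptions to be
# adjusted to the target site:
def exportIdRange(template, minId, maxId, fileName):
    """Write template % id to fileName for every id in [minId, maxId]."""
    with open(fileName, mode='w') as f:
        for i in range(minId, maxId + 1):
            f.write(template % i + "\n")
# e.g. exportIdRange("http://www.example.com/show.aspx?id=%d", 1, 5000, "id链接表.txt")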
def loadInitUrl(fileName):
"""从文件中加载种子链接,返回集合"""
print("\n加载初始网址...\n")
oneSet=set()
try:
f = open(fileName)
for link in f:
oneSet.add(link.strip())
f.close()
except:
pass
return oneSet
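# The seed file is plain text with one URL per line; note that the same file is
# later overwritten by writeToFile with the links that match the regex, e.g.:
#   http://www.nkjfrc.com/
#   http://www.nkjfrc.com/ResumeShow.aspx?id=1024   (hypothetical entry)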
def writeToFile(aset, regex, fileName):
    """Write the links in aset that match the regular expression to fileName."""
    f = open(fileName, mode='w')
    counter = 0
    for link in aset:
        if re.match(regex, link):
            print("Found a matching link:", link)
            f.write(link + "\n")
            counter = counter + 1
    f.close()
    print(" This pass wrote", counter, "links matching the regex to", fileName)
def smartRequest(url, encode):
    """Download a page's source code.
    More request headers could be captured from different browsers;
    this program does not rotate headers yet."""
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Cookie":"_ga=GA1.2.1707472291.1518440445; _gid=GA1.2.879678725.1518440445; aliyungf_tc=AQAAAG899EWo7QwAkfeqdbtP7US/hKO+; SERVER_ID=7a2a6789-c873f3e2; Hm_lvt_05d39f4d0b6d45b03bf3bb358aba968a=1518440459,1518485016; Hm_lvt_74489c025adf11db1de5f58194b93d62=1518440494,1518491019; Hm_lpvt_74489c025adf11db1de5f58194b93d62=1518492750; Hm_lpvt_05d39f4d0b6d45b03bf3bb358aba968a=1518570483",
"Upgrade-Insecure-Requests":"1",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}
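    # The docstring mentions rotating headers; here is a minimal sketch using the
    # already-imported random.choice (the extra User-Agent string is an assumption,
    # capture real ones from your own browsers):
    # userAgents = [headers["User-Agent"],
    #               "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0"]
    # headers["User-Agent"] = choice(userAgents)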
    page = requests.models.Response()   # empty Response object returned as a fallback
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.encoding = encode
    except:
        print("@", ctime(), "error while requesting", url)
    return page
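# Note: smartRequest is defined but never called below; the crawl loop in
# collectLinksFrom uses a bare requests.get. It could be swapped in, e.g.
# (the utf-8 encoding here is an assumption, check the target site):
#   page = smartRequest(oneurl, "utf-8")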
def collectLinksFrom(configList):
domain = configList['domain']
regex = configList['regex']
fileName = configList['filename']
    linkSet1 = loadInitUrl(fileName)   # load the seed URLs from the file
    if len(linkSet1) == 0:   # no seed file: try the domain root over http and https
        try:
            requests.get("http://www." + domain, timeout=3)
            linkSet1.add("http://www." + domain)
        except:
            pass
        try:
            requests.get("https://www." + domain, timeout=3)
            linkSet1.add("https://www." + domain)
        except:
            pass
    计数器 = 0
    linkSet2 = set()
    allLinks = set()         # allLinks stores every link seen so far
    startTime = time()
    正则 = re.compile(regex)
    运行 = True
    print("Number of initial links:", len(linkSet1))
while 运行:
for oneurl in linkSet1:
            # Abort check: to break out of this otherwise endless loop,
            # create an empty file named '中止.txt' ("abort.txt").
            持续时间 = int(time() - startTime)
            if 持续时间 % 10 == 0:   # check roughly every 10 seconds
                if os.path.exists("中止.txt"):
                    运行 = False
                    break
            # end of abort check
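            # To trigger the abort from a shell, one could use (assumed commands):
            #   Windows:   type nul > 中止.txt
            #   Linux/Mac: touch 中止.txt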
            try:
                page = requests.get(oneurl, timeout=10)   # fetch the page source
                tree = html.fromstring(page.content)      # build an lxml.html.HtmlElement tree
            except:
                print(oneurl, "could not be fetched or parsed")
                continue
            for link in tree.xpath("//a"):   # //a[contains(@href,'" + domain + "')] would miss the many relative links some sites use
                if not ('href' in link.attrib.keys()): continue
                newUrl = link.attrib['href']      # read the link target
                newUrl = newUrl.strip()
                newUrl = urljoin(oneurl, newUrl)  # resolve relative paths to absolute URLs
                if (domain in newUrl) and (not (newUrl in allLinks)):   # a new link was found
                    print(newUrl)
                    linkSet2.add(newUrl)
                    allLinks.add(newUrl)
                    计数器 = 计数器 + 1
                    if len(allLinks) % 20 == 0:
                        print(domain, "collecting,", len(allLinks), "links @", ctime(), " # create a file named '中止.txt' to stop this program\n")
                        writeToFile(allLinks, 正则, fileName)   # checkpoint: write the matching links to the file
        linkSet1 = linkSet2   # crawl the newly found links on the next pass
        linkSet2 = set()      # reset linkSet2 for the next pass
        if len(linkSet1) == 0:   # nothing new was found: the crawl is exhausted, stop looping
            运行 = False
print("\n发现中止条件,安全着陆")
print(len(allLinks),"个链接@",ctime())
writeToFile(allLinks,正则,fileName)
if __name__=="__main__":
    configList = dict()
    print("catLinkPicker (神猫链接采集器).\n")
    if len(sys.argv) < 2:
        print("No command-line arguments given, running in test mode.")
        configList['domain'] = "nkjfrc.com"
        configList['regex'] = "http://www.nkjfrc.com/ResumeShow.aspx"
        configList['filename'] = "链接表.txt"
    else:
        if len(sys.argv) == 4:
            configList['domain'] = sys.argv[1]
            configList['regex'] = sys.argv[2]
            configList['filename'] = sys.argv[3]
        else:
            selfName = sys.argv[0].split(".")[0]   # was configList[0], which raised a KeyError
            print("Wrong arguments; the form is:")
            print(selfName, "domain regex filename")
            sys.exit(1)   # configList is incomplete here, so do not start the crawl
    collectLinksFrom(configList)
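# Example invocation (assuming the script is saved as catLinkPicker.py; the
# domain and regex are the test-mode values from above):
#   python catLinkPicker.py nkjfrc.com "http://www.nkjfrc.com/ResumeShow.aspx" 链接表.txt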