"""
简易程序抓取911cha.com上面的动词表到文件中
这是由于在创造一个scratch作品时需要用到这些数据,
所以编制了这个程序。关键词:数据采集,爬虫
"""
import time
import requests
from lxml import html
# Browser-like request headers sent with every page fetch.
headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    # Header names must not contain spaces; this was 'Accept - Encoding'.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'Referer': 'http://www.scratch8.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

# Common prefix of the paginated verb-list pages; page i is f"{url}{i}.html".
url = 'https://cidian.911cha.com/cixing_dongci_p'
# Pages 1 through 20 of the verb list.
urls = [f"{url}{i}.html" for i in range(1, 21)]


def fetch_verbs(page_url, timeout=10):
    """Download one list page and return the set of verbs found on it.

    Args:
        page_url: Full URL of a verb-list page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set with the text of the first link of every list item under
        ``<ul class="l3">``. Links without text are skipped.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
    """
    page = requests.get(page_url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    page.raise_for_status()
    # Parse the raw response bytes into an lxml HTML element tree.
    tree = html.fromstring(page.content)
    # ``a.text`` is None when the anchor has no direct text; skip those so
    # that writing the words out later cannot fail on a None entry.
    return {a.text for a in tree.xpath("//ul[@class='l3']//li/a[1]") if a.text}


def scrape_verbs(page_urls, delay=1):
    """Fetch every page in ``page_urls`` and return the union of their verbs.

    Args:
        page_urls: Iterable of page URLs to download.
        delay: Seconds to sleep before each request (presumably to avoid
            hammering the site).

    Returns:
        A set containing every distinct verb collected.
    """
    verbs = set()
    for page_url in page_urls:
        time.sleep(delay)
        verbs |= fetch_verbs(page_url)
        print(page_url, '处理完毕。')
    return verbs


def save_verbs(verbs, path="动词表.txt"):
    """Write ``verbs`` to ``path`` as UTF-8 text, one word per line.

    The words are written in sorted order so repeated runs produce the same
    file; the original iterated the set in arbitrary order.

    Args:
        verbs: Iterable of strings to write.
        path: Destination file; it is created or overwritten.
    """
    # The context manager closes the file even if a write fails.
    with open(path, mode='w', encoding='utf-8') as f:
        f.writelines(f"{word}\n" for word in sorted(verbs))


if __name__ == "__main__":
    动词表 = scrape_verbs(urls)
    save_verbs(动词表)
    # The collected words are verbs (动词); the old message said nouns (名词).
    print('共写入', len(动词表), '个动词')