"""Scrape the verb list from 911cha.com and save it to a text file.

This small program was written because the data was needed while
creating a Scratch project.

Keywords: data collection, web crawler.
"""

import time

import requests
from lxml import html

# Browser-like request headers sent with every page request.
headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    # Header names must not contain spaces; the original key
    # 'Accept - Encoding' was not a recognised header.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'Referer': 'http://www.scratch8.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

# Common URL prefix of the paginated verb listing; pages are numbered 1..20.
url = 'https://cidian.911cha.com/cixing_dongci_p'
urls = [f"{url}{i}.html" for i in range(1, 21)]

# Collected verbs; a set so entries repeated across pages are stored once.
动词表 = set()

# A distinct loop name keeps the base ``url`` prefix above from being overwritten.
for page_url in urls:
    time.sleep(1)  # pause between requests to avoid hammering the site

    # requests.get: fetch the page. A timeout prevents hanging forever
    # on a stalled connection.
    page = requests.get(page_url, headers=headers, timeout=30)
    if not page.ok:
        # Skip error responses instead of parsing an error page as results.
        print(page_url, '请求失败,状态码:', page.status_code)
        continue

    # html.fromstring returns an lxml.html.HtmlElement (the HTML element tree).
    tree = html.fromstring(page.content)

    # The first <a> inside each <li> of the <ul class="l3"> list is
    # taken as one verb entry.
    for a in tree.xpath("//ul[@class='l3']//li/a[1]"):
        # a.text is None when the anchor has no leading text node;
        # skipping it avoids a TypeError when the words are written out.
        if a.text:
            动词表.add(a.text)

    print(page_url, '处理完毕。')

# The context manager closes the file even if a write fails. Sorting makes
# the output order reproducible between runs (set order is arbitrary).
with open("动词表.txt", mode='w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in sorted(动词表))

# The words collected are verbs, so the summary says 动词 (the original
# message incorrectly said 名词, i.e. nouns).
print('共写入', len(动词表), '个动词')