Python通过关键词下载百度图片(风火轮编程内部简易爬虫教学程序)

Python通过关键词下载百度图片(风火轮编程内部简易爬虫教学程序)

"""Python通过关键词下载百度图片.py 双击本程序,输入关键词,会下载一些图片,这些图片存放在以关键词为文件夹名称的目录中。"""

import requests
from tools import * 

def dump(filecontent,filename):
    """Write binary content to a file, replacing any existing file.

    Args:
        filecontent: The bytes to write.
        filename: Path of the file to create or overwrite.
    """
    # The context manager closes the handle even when write() raises.
    with open(filename, mode='wb') as f:
        f.write(filecontent)

def download(keyword,link):
    """Download ``link`` and save it as an image in the ``keyword`` folder.

    The download is best-effort: a failed request or a failed write is
    reported on stdout and skipped, so one bad link does not abort the
    whole run.

    Args:
        keyword: Search keyword; passed to ``random_file_name`` to choose
            the destination folder and file name.
        link: URL of the image to fetch.
    """
    try:
        resp = requests.get(link, timeout=5, verify=True)
        # Reject 4xx/5xx responses so error pages are not saved as images.
        resp.raise_for_status()
        dump(resp.content, random_file_name(keyword))
    except (requests.RequestException, OSError) as err:
        # Only network and file-system failures are tolerated; anything else
        # (including KeyboardInterrupt) propagates instead of being hidden.
        print("下载失败:", link, err)

if __name__ == "__main__":
    # Imported here so this script block brings its own dependency into scope.
    from urllib.parse import quote

    baidu_image_search = "http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word="

    # The raw keyword is kept as typed because download() also receives it.
    keyword = input("请输入关键词:\n")

    # Percent-encode the keyword (UTF-8, matching ie=utf-8 in the URL) so that
    # spaces, '&', '#' and similar characters cannot corrupt the query string.
    search_url = baidu_image_search + quote(keyword, safe="")

    links = get_all_links(search_url)

    for link in links:
        print("当前下载:",link,"\n")
        download(keyword,link)
#
#风火轮少儿编程内部简易爬虫教学程序初稿
#下面是tools模块的代码,此模块不做教学,转载请注明出处。
import requests,re
from random import choice
from time import time 
import os

def random_file_name(keywords):
    """Return a new ``.jpg`` path inside a folder named after ``keywords``.

    The folder is located under the current working directory and is created
    (including intermediate directories) when it does not exist yet. The
    file name is the fractional digits of the current timestamp followed by
    three random characters, so collisions are unlikely but not impossible.

    Args:
        keywords: Folder name, relative to the current working directory.

    Returns:
        The full path string ``<cwd>/<keywords>/<digits><3 chars>.jpg``.
    """
    folder = os.getcwd() + os.sep + keywords
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # works when the keyword contains a path separator.
    os.makedirs(folder, exist_ok=True)
    # Each character appears exactly once so every choice is equally likely.
    charset = 'abcdefghijklmnopqrstuvwxyz0123456789'
    suffix = "".join(choice(charset) for _ in range(3))
    # Digits after the decimal point of the current time in seconds.
    fraction = str(time()).split(".")[-1]
    return folder + os.sep + fraction + suffix + ".jpg"


def get_all_links(link):
    """Fetch a page and return the Baidu image links found in its HTML.

    Only URLs of the form
    ``https://ss[0-3].bdstatic.com/<id>/it/u=<n>,<n>&fm=<n>&gp=0.jpg`` are
    collected; other links in the page are ignored.

    Args:
        link: URL of the page to fetch.

    Returns:
        A set of the matching image URL strings (empty when none match).

    Raises:
        requests.RequestException: If the page request itself fails.
    """
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               # Header names must not contain spaces.
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}

    resp = requests.get(link, headers=headers, timeout=5, verify=True)
    html = resp.text
    # Rewrite the http://img0..img4.imgtn.bdimg.com hosts to a fixed
    # https://ss1.bdstatic.com prefix so the pattern below can match them.
    for i in range(5):
        html = html.replace("http://img" + str(i) + ".imgtn.bdimg.com","https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy")
    # Raw string keeps \w, \d and \. as regex escapes; the host dots are
    # escaped so they match literal dots only.
    pattern = r"https://ss[0-3]\.bdstatic\.com/\w+/it/u=\d+,\d+&fm=\d+&gp=0\.jpg"
    return set(re.findall(pattern, html))


if __name__ == "__main__":
    # Manual smoke test: print one generated path for a sample keyword.
    print(random_file_name("风火轮少儿编程"))







 

李兴球

李兴球的博客是Python创意编程原创博客