【python】图虫网站爬虫分享

爬取图片系列
本次加上了爬取记录,方便断点续爬

import os,threading
import re
import json
import requests
import time
import urllib.parse
import random
import urllib
task = []
head = {'Accept': '*/*',
               'Accept-Language': 'en-US,en;q=0.8',
               'Cache-Control': 'max-age=0',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
               'Connection': 'keep-alive',
              }
print('✨'*30)
print('          ψ(*`ー´)ψ')
print('')
print(' .'*15) 
print('           图虫爬虫') 
print(' .'*15)
print('')
print('       版本号:2019.8.04')
print('           (•́へ•́╬)')
 
print('✨'*30)
 
name  = 0
 
ask = input('是否要从上次退出的地方开始? y/n:')
if ask == 'y':
    f = open("爬虫断点记录.txt","a")
    f.write(' ')
    f.close
 
    f= open("爬虫断点记录.txt","r")
    page = f.read()
    f.close
    print('检测到上次爬取到第'+str(page)+'页,将继续')
    print('务必输入上次爬取的关键词')
else:
    page = 1
 
 
 
 
def make_url(theme,page_sum):
    url = "https://tuchong.com/rest/tags/%(theme)s/posts?page=%(pagenum)s&count=20&order=weekly" % {'theme': urllib.parse.quote(theme), 'pagenum': page_sum}
    return url
#利用requests使用get方法使用链接
 
def get_url(url):#用requests.get 解析 url
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status() 
        return r
    except:
        pass
        print('?????')
 
def cantclick (url): #解析那个不能右键的页面//注意:总页码输入过多会报错!
     
    a = requests.get(url)
    b = json.loads(a.text) 
    abc = []
    for i in b['postList']:
        abc.append(i['url'])
        # print(abc)
    return abc
 
 
def get_jpg(url):#顾名思义,获取jpg的url
    try:
        html = get_url(url)
        url_list = list(re.findall('<img id="image\d+" class="multi-photo-image" src="([a-zA-z]+://[^\s]*)" alt="">', html.text))
        return url_list
    except:
        pass
 
#定义下载器模块
def download(theme,n):
    global name,task
     
    while True:
        try:
            url = task.pop()
        
        except:
            time.sleep(5)
            print('图片列表空')
             
        name += 1 #计数器
        name2 = url.split('/')[-1] #命名
        download_file = os.path.join(theme,name2) #跨平台兼容 融合文件夹路径+图片名
        # print(url)
        print('列表任务数:%s' % len(task))
        try:
            print("正在下载第%d套图,一共%d套图" % (n, len(cantclick_list)))
            urllib.request.urlretrieve(url,filename=download_file)
             
            print('共下载了'+str(name)+'张')
            print('---'*10)
        except:
            print('下载失败')
            pass     
 
def run():
    global n,m,page_sum,cantclick_list
    while True:
        n = 0
        m += 1
        page_sum += 1
        print("正在下载第%d页,共%d页" % (m, page_sum))
         
        f = open("爬虫断点记录.txt","w")
        f.write(str(page_sum))
        print('进度已储存')
        f.close
        url = make_url(theme,page_sum)
         
        cantclick_list = cantclick(url)
        if len(cantclick_list) == 0:
            print('爬取结束') 
            break
 
         
        for ii in cantclick_list:
            jpg_list = get_jpg(ii)
            n += 1
            for iii in jpg_list:
                task.append(iii)
                # print(iii)
                 
if __name__ == '__main__':
    theme = input("输入你想看的类型,如 人像 人文 私房(误) 死库水(大误) 或其他:")
    page_sum = int(page)
    folder = os.getcwd()+'/'+theme # xxx = os.getcwd() #获取当前绝对路径 # + 编写目录
    test=os.path.exists(folder)
    if not test:
            # 如果不存在则创建目录
            # 创建目录操作函数
            os.mkdir(folder)            #--------------------创建theme文件夹
     
            print('初始化...')
             
    else:
            # 如果目录存在则不创建,并提示目录已存在
            print('初始化完成...')
    
    m = 0
 
                 
                # time.sleep(random.randint(1, 4))  #防反爬,视情况调整
 
v = input('输入线程数量(视网络情况定,默认30): ')
if v == '':
    v = 30
else:
    v = int(v)
print('即将释放爬虫数:%s' % v)
threading.Thread(target=run).start()
print('正在分配任务...等待时间长则说明网络状态差,报错请减少线程数量') 
while len(task) <= 100:
    time.sleep(0.3)
    print('爬虫任务预加载:%s%%' % len(task))
else:
    for iv in range(v):#最大线程数,可根据网络情况修改,线程总工作速度大于任务分配速度则报错
        threading.Thread(target=download,args=(theme,n)).start()
        print('下载线程-%s启动'%iv)
     
print('https://github.com/gitxxp')
© 版权声明
THE END
喜欢就支持以下吧
点赞0赞赏
分享
评论 抢沙发

请登录后发表评论