mangabz manga crawler
Known issues (unresolved; possible mitigations are sketched alongside the code below):
1: Parallel workers each open their own Chrome instance, which exhausts memory or crashes Chrome
2: Downloads come back incomplete, so the script has to be re-run several times to fill in the missing pages
3: Log output from concurrent workers is interleaved and unreadable
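For issue 3, the garbled output comes from every worker process print()-ing to stdout at once. A minimal sketch, assuming the rest of the script stays as-is: route messages through the logging module and put %(processName)s in the format so each line is at least attributable to its worker (demo is an illustrative name, not part of the crawler):

import logging
from multiprocessing import Pool

# Module-level config re-runs in each spawned worker on Windows, so every
# process gets the same format, tagged with its own process name.
logging.basicConfig(
    format='%(asctime)s %(processName)s %(levelname)s: %(message)s',
    level=logging.INFO,
)
log = logging.getLogger(__name__)

def demo(page_no):
    # Stands in for the bare print() calls in the crawler below.
    log.info('fetching page %02d', page_no)

if __name__ == '__main__':
    with Pool(4) as pool:
        pool.map(demo, range(1, 5))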

import requests
import fake_useragent
import time
from lxml import etree
import os
from selenium import webdriver
from selenium.webdriver.support import wait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from multiprocessing import Pool
from selenium.common.exceptions import TimeoutException

# Random User-Agent for the plain requests call that fetches the chapter list.
header = {'User-Agent': fake_useragent.UserAgent().random}


def get_pic(src, ipg, dir_name):
    # src is the (single-element) list of image URLs from the cp_image XPath.
    if not os.path.exists('./pic/%s' % dir_name):
        os.makedirs('./pic/%s' % dir_name)
    pic_name = src[0].split('?')[0].split('/')[-1]
    if not os.path.exists('./pic/%s/%s' % (dir_name, pic_name)):
        print('Starting download:', src)
        response = requests.get(url=src[0])
        with open('./pic/%s/%s' % (dir_name, pic_name), 'wb') as ft:
            ft.write(response.content)
        print('Page %02d downloaded successfully!' % ipg)
    else:
        print('Already downloaded')
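Issue 2 is partly because get_pic has no timeout or retry: a hung or failed requests.get just leaves a hole until the whole script is re-run. A minimal sketch of a more defensive variant, assuming the same directory layout; the retry count and timeout are arbitrary choices, not values from the original:

def get_pic_with_retry(src, ipg, dir_name, retries=3):
    # Same layout as get_pic, but retries transient failures instead of
    # silently leaving a gap to be filled on the next full run.
    os.makedirs('./pic/%s' % dir_name, exist_ok=True)
    pic_name = src[0].split('?')[0].split('/')[-1]
    target = './pic/%s/%s' % (dir_name, pic_name)
    if os.path.exists(target):
        print('Already downloaded:', pic_name)
        return
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url=src[0], headers=header, timeout=15)
            response.raise_for_status()
            with open(target, 'wb') as ft:
                ft.write(response.content)
            print('Page %02d downloaded on attempt %d' % (ipg, attempt))
            return
        except requests.RequestException as e:
            print('Attempt %d for page %02d failed: %s' % (attempt, ipg, e))
    print('Giving up on page %02d' % ipg)

Swapping this in for get_pic inside get_pic_url would let a single run converge instead of needing several passes.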
# Shared headless Chrome options; every driver below is built from these.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')


def get_pic_url(href, dir_name, ipg):
    # A fresh Chrome instance is launched for every page, which is the root
    # of issue 1 above once many pool workers run at the same time.
    browser = webdriver.Chrome(options=options,
                               executable_path=r'E:\py\chromedriver.exe')
    href = href + '#ipg%s' % ipg
    try:
        browser.get(href)
        wait.WebDriverWait(browser, 15, 1).until(
            ec.presence_of_element_located((By.ID, 'cp_image')))
    except TimeoutException:
        print('Request timed out, retrying with a longer wait')
        browser.get(href)
        wait.WebDriverWait(browser, 30, 1).until(
            ec.presence_of_element_located((By.ID, 'cp_image')))
        et = etree.HTML(browser.page_source)
        src = et.xpath('//img[@id="cp_image"]/@src')
        print('Fetching page %02d' % ipg, src)
        get_pic(src, ipg, dir_name)
        print('Downloading again')
    et = etree.HTML(browser.page_source)
    src = et.xpath('//img[@id="cp_image"]/@src')
    print('Fetching page %02d' % ipg, src)
    browser.quit()
    get_pic(src, ipg, dir_name)


def get_page():
    url = 'http://www.mangabz.com/73bz/'
    response = requests.get(url=url, headers=header)
    et = etree.HTML(response.text)
    a_list = et.xpath('//div[@id="chapterlistload"]/a')
    for a in a_list:
        href = 'http://www.mangabz.com/' + a.xpath('./@href')[0]
        dir_name = a.xpath('./text()')[0].strip()
        # The <span> holds the chapter's page count; keep only its digits.
        ipg = a.xpath('./span/text()')[0]
        print(href, dir_name, ipg)
        ipg = int(''.join(filter(str.isdigit, ipg)))
        print('Starting chapter %s' % dir_name)
        path = r'E:\py\manhua\pic\%s' % dir_name
        if os.path.exists(path):
            dir_num = len([f for f in os.listdir(path)
                           if os.path.isfile(os.path.join(path, f))])
            if dir_num == ipg - 1:  # range(1, ipg) yields ipg-1 pages
                print('Chapter %s already downloaded' % dir_name)
                continue
            pool = Pool(16)
        else:
            pool = Pool(32)
        for page_no in range(1, ipg):
            # apply_async followed by an immediate wait() runs one page at a
            # time; it throttles Chrome but gives up the pool's parallelism.
            result = pool.apply_async(get_pic_url, (href, dir_name, page_no))
            result.wait()
        pool.close()
        # Last-resort cleanup if orphaned drivers pile up:
        # os.system("taskkill /f /im chromedriver.exe /t")


if __name__ == '__main__':
    get_page()
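For issue 1, the dominant cost is that get_pic_url starts and quits a fresh Chrome for every single page. A hedged sketch of one alternative: give each pool worker a single long-lived browser via the Pool initializer and reuse it across pages (worker_browser, init_worker and fetch_page are illustrative names, not part of the script above):

worker_browser = None  # one Chrome instance per worker process

def init_worker():
    # Runs once in each pool worker; the browser then serves every page
    # that worker is handed, instead of one Chrome per page.
    global worker_browser
    worker_browser = webdriver.Chrome(options=options,
                                      executable_path=r'E:\py\chromedriver.exe')

def fetch_page(args):
    href, dir_name, ipg = args
    worker_browser.get(href + '#ipg%s' % ipg)
    wait.WebDriverWait(worker_browser, 15, 1).until(
        ec.presence_of_element_located((By.ID, 'cp_image')))
    et = etree.HTML(worker_browser.page_source)
    get_pic(et.xpath('//img[@id="cp_image"]/@src'), ipg, dir_name)

# Usage: a pool of 4 workers means at most 4 Chrome instances alive at once.
# pool = Pool(4, initializer=init_worker)
# pool.map(fetch_page, [(href, dir_name, n) for n in range(1, ipg)])

With this shape the number of live Chrome instances is capped by the pool size instead of growing with the page count, which should keep memory bounded.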