Example 1: Tencent Comics crawler

import requests
from fake_useragent import UserAgent
import re
import threading


def replace_mark(my_str):
    # Replace ASCII commas and quotes with fullwidth ones so they do not break the CSV
    return my_str.replace(",", ",").replace('"', "“")


def format_html(html):
    li_pattern = re.compile(r'<li\sclass="ret-search-item clearfix">[\s\S]+?</li>')
    title_url_pattern = re.compile('<a href="(.*?)" target="_blank" title=".*?">(.*?)</a>')
    sign_pattern = re.compile('<i class="ui-icon-sign">签约</i>')
    exclusive_pattern = re.compile('<i class="ui-icon-exclusive">独家</i>')
    author_pattern = re.compile('<p class="ret-works-author" title=".*?">(.*?)</p>')
    tags_pattern = re.compile('<span href=".*?" target="_blank">(.*?)</span>')
    score_pattern = re.compile('<span>人气:<em>(.*?)</em></span>')
    items = li_pattern.findall(html)
    for item in items:
        title_url = title_url_pattern.search(item)
        title = title_url.group(2)
        url = title_url.group(1)
        sign = 0
        exclusive = 0
        if sign_pattern.search(item) is not None:
            sign = 1
        if exclusive_pattern.search(item) is not None:
            exclusive = 1
        author = author_pattern.search(item).group(1)
        tags = tags_pattern.findall(item)
        score = score_pattern.search(item).group(1)
        # Serialize writes from multiple threads
        lock.acquire()
        with open("./qq.csv", "a+", encoding="utf-8") as f:
            f.write(f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')
        lock.release()


def run(index):
    ua = UserAgent(use_cache_server=False)
    response = requests.get(f"https://ac.qq.com/Comic/index/page/{index}", headers={'User-Agent': ua.random})
    html = response.text
    format_html(html)
    semaphore.release()


lock = threading.Lock()

if __name__ == "__main__":
    semaphore = threading.BoundedSemaphore(5)  # at most 5 concurrent requests
    lst_record_threads = []
    for index in range(1, 462):
        print(f"Crawling page {index}")
        semaphore.acquire()
        t = threading.Thread(target=run, args=(index,))
        t.start()
        lst_record_threads.append(t)
    for rt in lst_record_threads:
        rt.join()
    print("Crawling finished")

Example 2: 19lou dating-forum image crawler

import requests
from lxml import etree
from fake_useragent import UserAgent
import time


def save(src, title):
    try:
        res = requests.get(src)
        with open(f"imgs/{title}.jpg", "wb+") as f:
            f.write(res.content)
    except Exception as e:
        print(e)


def run(url):
    # ua = UserAgent(cache=False)
    ua = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    headers = {
        "User-Agent": ua,
        "Host": "www.19lou.com",
        "Referer": "https://www.19lou.com/r/1/19lnsxq-233.html",
        "Cookie": "_Z3nY0d4C_=37XgPK9h"  # value taken from the site's anti-crawler JavaScript
    }
    try:
        res = requests.get(url=url, headers=headers)
        text = res.text
        # Parse the HTML into an Element object
        html = etree.HTML(text)
        # XPath: @class selects by the class attribute
        divs = html.xpath("//div[@class='pics']")
        # print(len(divs))
        # Walk the matched Element nodes
        for div in divs:
            # The real image address is in data-src, not src (lazy loading)
            src = div.xpath("./img/@data-src")[0]
            # The title comes from the alt attribute
            title = div.xpath("./img/@alt")[0]
            save(src, title)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    urls = ["https://www.19lou.com/r/1/19lnsxq.html"]
    for i in range(114, 243):
        urls.append(f"https://www.19lou.com/r/1/19lnsxq-{i}.html")
    for url in urls:
        print(f"Crawling {url}")
        run(url)
        # time.sleep(5)
    print("All pages crawled")

Example 3: Snack franchise data crawler

import requests
from lxml import etree
import time
import re
import random

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]


def run(url, index):
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        res = requests.get(url=url, headers=headers)
        res.encoding = "utf-8"
        html = res.text
        with open(f"./html/{index}.html", "w+", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        print(e)


def list_str(my_list):
    return ",".join(my_list)


def get_data():
    for i in range(1, 130):
        with open(f"./html/{i}.html", "r", encoding="utf-8") as f:
            html = f.read()
        element = etree.HTML(html)
        # contains() matches elements whose class contains the given string;
        # related XPath functions include starts-with, ends-with and last
        origin_li = element.xpath("//ul[contains(@class,'xm-list')]/li")
        # Extract the data inside each li
        for item in origin_li:
            # Franchise name
            # title = item.xpath(".//div[@class='r']/h4/text()")[0]
            title = item.xpath("./div[@class='top']/a/@title")[0]
            # Detail page link
            detail_link = "http:" + item.xpath("./div[@class='top']/a/@href")[0]
            # Special class marker
            special_tag = list_str(item.xpath("./@class"))
            # Items carrying the xm-list2 class use a different layout, so different rules apply
            if special_tag != "xm-list2":
                # Tags
                tags = "#".join(item.xpath(".//div[@class='bot']/span[@class='label']/text()"))
                # Investment price
                price = list_str(item.xpath(".//div[@class='bot']/span[@class='money']/b/text()"))
                # City and industry
                city_industry = list_str(item.xpath("./div[@class='bot']/p/span/text()"))
                long_str = f"{title},{detail_link}, {tags}, {price}, {city_industry}"
                save(long_str)
            else:
                # City and industry
                city_industry = list_str(item.xpath("./div[@class='top']/a/div/p[2]/span/text()"))
                long_str = f"{title},{detail_link}, {0}, {0}, {city_industry}"
                save(long_str)


def save(long_str):
    try:
        with open("./jiameng.csv", "a+", encoding="utf-8") as f:
            f.write("\n" + long_str)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # for i in range(1, 130):
    #     print(f"Crawling page {i}")
    #     run(f"https://www.3158.cn/xiangmu/canyin/?pt=all&page={i}", i)
    get_data()
    print("Extraction finished")

Example 4: Audio crawler

import time
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import lxml


async def get_html(client, url):
    print("Fetching list page", url)
    async with client.get(url, timeout=5) as resp:
        html = await resp.text()
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all(attrs={'class': 'img_mini'})
        mp3_urls = [get_mp3_url("https://www.bensound.com/" + div.a.img["src"]) for div in divs]
    return mp3_urls


def get_mp3_url(img_url):
    # Derive the mp3 URL from the cover-image URL
    name = img_url[img_url.rfind("/") + 1:img_url.rfind(".")]
    mp3_url = f"https://www.bensound.com/bensound-music/bensound-{name}.mp3"
    return mp3_url


async def get_mp3_file(client, url):
    print("Downloading mp3 file", url)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
        "Referer": "https://www.bensound.com/royalty-free-music"
    }
    mp3_file_name = url[url.rfind('-') + 1:url.rfind('.')]
    print(mp3_file_name)
    async with client.get(url, headers=headers) as resp:
        content = await resp.read()
        with open(f'./mp3/{mp3_file_name}.mp3', "wb") as f:
            f.write(content)
    return (url, "success")


async def main(urls):
    timeout = aiohttp.ClientTimeout(total=600)  # raise the total timeout to 600 seconds
    connector = aiohttp.TCPConnector(limit=50)  # cap the number of concurrent connections
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as client:
        tasks = [asyncio.ensure_future(get_html(client, urls[i])) for i in range(len(urls))]
        dones, pendings = await asyncio.wait(tasks)
        print("List pages fetched, collecting results:")
        all_mp3 = []
        for task in dones:
            all_mp3.extend(task.result())
        totle = len(all_mp3)
        print("Collected [", totle, "] MP3 file URLs")
        print("_" * 100)
        print("Starting MP3 downloads")
        # Download 10 files per batch
        totle_page = totle // 10 if totle % 10 == 0 else totle // 10 + 1
        for page in range(0, totle_page):
            print("Downloading MP3 batch {}".format(page + 1))
            start_page = 0 if page == 0 else page * 10
            end_page = (page + 1) * 10
            print("URLs in this batch")
            print(all_mp3[start_page:end_page])
            mp3_download_tasks = [asyncio.ensure_future(get_mp3_file(client, url)) for url in all_mp3[start_page:end_page]]
            mp3_dones, mp3_pendings = await asyncio.wait(mp3_download_tasks)
            for task in mp3_dones:
                print(task.result())


if __name__ == '__main__':
    url_format = "https://www.bensound.com/royalty-free-music/{}"
    urls = [url_format.format(i) for i in range(1, 5)]
    start_time = time.perf_counter()
    asyncio.run(main(urls))
    print("aiohttp async crawl took:", time.perf_counter() - start_time)

Example 5: Mobile wallpaper crawler

import requests
import re
import threading

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}


# Fetch one list page and extract the wallpaper URLs
def get_image(base_url):
    res = requests.get(url=base_url, headers=headers)
    if res is not None:
        html = res.text
        pattern = re.compile('<img lazysrc="(.*?)" lazysrc2x=".*?" width="221" height="362" alt=".*?" title="(.*?)"')
        match_list = pattern.findall(html)
        for url, title in match_list:
            save_image(url[:url.find('jpg') + 3], title)
        print(match_list)
    semaphore.release()  # free one slot so the main loop can start the next thread


def save_image(url, title):
    try:
        print(f"{title} - {url}")
        res = requests.get(url=url, headers=headers)
        if res is not None:
            with open(f"images/{title}.jpg", "wb+") as f:
                f.write(res.content)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Allow at most 5 threads at a time
    semaphore = threading.BoundedSemaphore(5)
    for index in range(189):
        semaphore.acquire()
        t = threading.Thread(target=get_image, args=(f"https://www.3gbizhi.com/sjbz/index_{index}.html",))
        t.start()
    while threading.active_count() != 1:
        pass
    else:
        print('All threads finished')

1. What is a crawler?

A crawler is a program that automatically harvests information from the internet, fetching the data that is valuable to us.

2. Python crawler architecture

A Python crawler architecture is typically made up of five parts: the scheduler, the URL manager, the page downloader, the page parser, and the application (the valuable data that has been crawled).

  • Scheduler: the equivalent of a computer's CPU; it coordinates the work of the URL manager, the downloader, and the parser.
  • URL manager: keeps track of the URLs waiting to be crawled and the URLs already crawled, preventing duplicate and circular crawling. A URL manager is usually implemented in one of three ways: in memory, in a database, or in a cache database.
  • Page downloader: downloads the page at a given URL and turns it into a string. Options include urllib2 (the Python 2 standard-library module, with support for login, proxies, and cookies) and requests (a third-party package).
  • Page parser: parses the page string and extracts the information we need, optionally by walking the page as a DOM tree. Parsers include regular expressions (intuitive: treat the page as a string and extract data by fuzzy matching, but this becomes very hard once the document gets complex), html.parser (bundled with Python), BeautifulSoup (a third-party library that can parse with either the built-in html.parser or lxml, and is more powerful than the other options), and lxml (a third-party library that parses both XML and HTML). html.parser, BeautifulSoup, and lxml all parse the page as a DOM tree.
  • Application: the program built from the useful data extracted from the pages.

The scheduler ties the other components together in a loop: it takes a URL from the URL manager, hands it to the downloader, passes the downloaded page to the parser, stores the extracted data, and feeds any newly discovered URLs back into the URL manager. A minimal sketch of this loop is shown below:

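The following sketch is only illustrative: it wires the five components together roughly as described above, using a hypothetical seed URL (https://example.com), an in-memory URL manager, and a crude regex-based parser that just collects page titles.

import requests
import re


class UrlManager:
    # URL manager: tracks URLs waiting to be crawled and URLs already crawled
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add(self, url):
        if url and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new(self):
        return len(self.new_urls) > 0

    def get(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url


def download(url):
    # Page downloader: fetch the page and return it as a string
    return requests.get(url, timeout=5).text


def parse(html):
    # Page parser: extract new links plus the "valuable" data (here just the page title)
    links = re.findall(r'href="(http[^"]+)"', html)
    titles = re.findall(r"<title>(.*?)</title>", html)
    return links, titles


if __name__ == "__main__":
    manager = UrlManager()
    manager.add("https://example.com")  # hypothetical seed URL
    collected = []  # the "application": the data we keep
    while manager.has_new() and len(collected) < 5:  # the scheduler loop
        page = download(manager.get())
        links, titles = parse(page)
        for link in links:
            manager.add(link)
        collected.extend(titles)
    print(collected)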

3. Three ways to download a web page with urllib2

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Python 2.7 code: in Python 3, urllib2 and cookielib became urllib.request and http.cookiejar

import cookielib
import urllib2

url = "http://www.baidu.com"

response1 = urllib2.urlopen(url)
print "Method 1"
# Get the status code; 200 means success
print response1.getcode()
# Length of the page content
print len(response1.read())

print "Method 2"
request = urllib2.Request(url)
# Pretend to be a Mozilla browser
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib2.urlopen(request)
print response2.getcode()
print len(response2.read())

print "Method 3"
cookie = cookielib.CookieJar()
# Give urllib2 the ability to handle cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cookie
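The code above only runs on Python 2, where urllib2 and cookielib exist. As a rough sketch (not part of the original article), the same three approaches on Python 3 would use urllib.request and http.cookiejar from the standard library:

# Python 3 sketch of the same three approaches
import http.cookiejar
import urllib.request

url = "http://www.baidu.com"

# Method 1: plain urlopen
response1 = urllib.request.urlopen(url)
print(response1.getcode())    # status code, 200 means success
print(len(response1.read()))  # length of the page content

# Method 2: Request object with a custom User-Agent header
request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
response2 = urllib.request.urlopen(request)
print(response2.getcode())
print(len(response2.read()))

# Method 3: install an opener that handles cookies
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
urllib.request.install_opener(opener)
response3 = urllib.request.urlopen(url)
print(response3.getcode())
print(len(response3.read()))
print(cookie)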

4. Installing the third-party library Beautiful Soup

Beautiful Soup is a third-party Python library for extracting data from XML and HTML documents; documentation is available on the official Beautiful Soup website.

1. Installing Beautiful Soup

Open cmd (the Command Prompt), change into the Scripts directory of your Python installation (Python 2.7 here), and type dir to check whether pip.exe is present. If it is, you can install the package with Python's bundled pip by running:

pip install beautifulsoup4

2. Testing the installation

Create a Python file containing:

import bs4
print bs4

Run the file; if it prints the module without errors, the installation succeeded.

5. Parsing an HTML document with Beautiful Soup

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">…</p>
"""

# Create a BeautifulSoup parse object
soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")

# Get all links
links = soup.find_all('a')
print "All links"
for link in links:
    print link.name, link['href'], link.get_text()

print "Find a link by its exact URL"
link_node = soup.find('a', href="http://example.com/elsie")
print link_node.name, link_node['href'], link_node['class'], link_node.get_text()

print "Match a link with a regular expression"
link_node = soup.find('a', href=re.compile(r"ti"))
print link_node.name, link_node['href'], link_node['class'], link_node.get_text()

print "Get the text of the story paragraph"
p_node = soup.find('p', class_='story')
print p_node.name, p_node['class'], p_node.get_text()

PyCharm

PyCharm is a Python IDE developed by JetBrains.

PyCharm provides the features you would expect from a Python IDE: debugging, syntax highlighting, project management, code navigation, intelligent suggestions, auto-completion, unit testing, version control, and so on.

In addition, PyCharm offers good support for Django development, supports Google App Engine, and even supports IronPython.

The official PyCharm download page can be found on the JetBrains website.



Sublime Text

Sublime Text has a polished user interface and powerful features such as the code minimap, Python plugins, and code snippets. Key bindings, menus, and toolbars are all customizable.

Its main features include spell checking, bookmarks, a complete Python API, the Goto functionality, instant project switching, multiple selections, multiple windows, and more.

Sublime Text is a cross-platform editor that runs on Windows, Linux, and Mac OS X.


Using Sublime Text 2's plugin system, you can easily turn it into a decent Python IDE. A few recommended plugins (you can find many more):

  • CodeIntel: auto-completion plus member/method hints (highly recommended)
  • SublimeREPL: for running and debugging interactive programs (e.g. anything that calls input())
  • Bracket Highlighter: bracket matching and highlighting
  • SublimeLinter: PEP 8 style checking

Eclipse + PyDev

1. Installing Eclipse

Eclipse can be downloaded from its official website, Eclipse.org; pick the edition that suits you, for example Eclipse Classic. Once downloaded, extract it to the directory where you want to install it.

Before running Eclipse you must make sure a Java runtime is installed, i.e. a JRE or JDK; you can download a JRE from http://www.java.com/en/download/manual.jsp.

2. Installing PyDev

After starting Eclipse, choose Help -> Install New Software.


Click Add and enter the PyDev update site: http://pydev.org/updates/.


Click OK, then click the "+" next to PyDev to expand its node. It takes a short while to fetch the list of PyDev packages from the internet; once that finishes, the packages appear as child nodes. Tick them and click Next to install.


After the installation finishes, restart Eclipse.

3. Configuring PyDev

After installation, PyDev still needs to be configured: choose Window -> Preferences. To set the Python interpreter path, go to the PyDev Interpreter - Python page and click New.


A dialog pops up asking for the location of your Python installation; select the directory where Python is installed.


Once that is done, PyDev is configured and ready to use.

4. Creating a Python project

With Eclipse and PyDev installed, we can start developing projects. First create a project: choose File -> New -> PyDev Project.


In the dialog that appears, fill in the Project Name and the location where the project will be saved, then click Next to finish creating the project.


5. Creating a new PyDev module

A project on its own cannot be run; next we must create a PyDev Module: choose File -> New -> PyDev Module.


In the dialog, choose where to store the file and enter the Module Name; note that the name should not include the .py extension, as it is added automatically. Click Finish to create the module.


Type in the "hello world" code, for example:
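Any code will do here; a one-line placeholder is enough to test the setup:

print("hello world")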


6. Running the program

Once the code is written, run the program using the Run button in the toolbar at the top.


A dialog then asks how you want to run it; normally choose Python Run to execute the program.
