1 urllib.request模块
1.1 版本
python2 :urllib2、urllib
python3 :把urllib和urllib2合并,urllib.request
1.2 常⽤的⽅法
urllib.request.urlopen(“⽹址”) 作⽤ :向⽹站发起⼀个请求并获取响应
import urllib.request
# Send a GET request. The response object is closed automatically when the
# `with` block exits.
with urllib.request.urlopen('https://www.duitang.com/') as response:
    # read() returns the response body as raw bytes
    print(response.read())
结果输出一系列字节流
- encode() 字符串–> 转换为bytes数据类型
- decode() bytes数据类型–> 转换为字符串
字节流 = response.read()
字符串 = response.read().decode(“utf-8”)
import urllib.request
# Send a GET request. The response object is closed automatically when the
# `with` block exits.
with urllib.request.urlopen('https://www.duitang.com/') as response:
    # read() returns bytes; decode('utf-8') converts them to str
    html = response.read().decode('utf-8')
print(type(html), html)
结果就转化为了网页源代码格式 字符串类型
urllib.request.Request("网址", headers="字典") 可以支持重构User-Agent;urlopen()不支持重构User-Agent
- 对于有反爬的网页
import urllib.request

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}

# 1. Build the request object, attaching the custom User-Agent header
req = urllib.request.Request(url, headers=headers)
# 2. Send the request and obtain the response object (closed on exit)
with urllib.request.urlopen(req) as response:
    # 3. Read the response body and decode the bytes to str
    html = response.read().decode('utf-8')
print(html)
结果就可呈现
- 使用流程
- 利用Request()方法构建请求对象
- 利用urlopen()方法去获取响应对象
- 利用响应对象中的read().decode(‘utf-8’) 读取响应对象内容
1.3 响应对象
read() 读取服务器响应的内容
getcode() 返回HTTP的响应码
import urllib.request

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}

# 1. Build the request object
req = urllib.request.Request(url, headers=headers)
# 2. Send the request and obtain the response object (closed on exit)
with urllib.request.urlopen(req) as response:
    # 3. Read the response body and decode the bytes to str
    html = response.read().decode('utf-8')
    # print(html)
    print(response.getcode())  # HTTP status code, e.g. 200
    print(response.geturl())  # URL the data actually came from, e.g. https://www.baidu.com/
geturl() 返回实际数据的URL(防⽌重定向问题)
import urllib.request

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}

# 1. Build the request object
req = urllib.request.Request(url, headers=headers)
# 2. Send the request and obtain the response object (closed on exit)
with urllib.request.urlopen(req) as response:
    # 3. Read the response body and decode the bytes to str
    html = response.read().decode('utf-8')
    print(response.geturl())  # URL the data actually came from, e.g. https://www.baidu.com/
2 urllib.parse模块
2.1 常⽤⽅法
urlencode(字典):把字典编码成URL查询字符串,其中的汉字会被转换为 %XX 形式的百分号编码(即UTF-8字节的十六进制表示)
# Target URL: https://www.baidu.com/s?wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B
import urllib.parse

params = {'wd': '海贼王'}
# urlencode() turns the dict into "key=value" pairs and percent-encodes
# the non-ASCII value
query = urllib.parse.urlencode(params)
print(query)
结果
wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B
- 练习一
# Prompt for a search term and save the Baidu result page as an HTML file
# in the current directory.
import urllib.request
import urllib.parse

# Example target: https://www.baidu.com/s?wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B
# Build the URL
baseurl = 'https://www.baidu.com/s?'
name = input('请输入要搜索的内容:')
# Percent-encode the search term with urlencode()
wd = {'wd': name}
name = urllib.parse.urlencode(wd)
url = baseurl + name
# print(url)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400','Cookie':'BAIDUID=0E1D7663D747715D94313EFB4E2C33AC:FG=1; BIDUPSID=0E1D7663D747715D94313EFB4E2C33AC; PSTM=1587718856; BD_UPN=1a314753; BDUSS=lna0k2Tm1aVmV1dGgzQmlqSGRORE1EbEJrVTZXNFJNTXJlVHB5eXRmQjJRRDFmSUFBQUFBJCQAAAAAAAAAAAEAAAB73m8ltPS09LXEuMK4wjEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHazFV92sxVfVT; MCITY=-%3A; BDUSS_BFESS=lna0k2Tm1aVmV1dGgzQmlqSGRORE1EbEJrVTZXNFJNTXJlVHB5eXRmQjJRRDFmSUFBQUFBJCQAAAAAAAAAAAEAAAB73m8ltPS09LXEuMK4wjEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHazFV92sxVfVT; BDRCVFR[sK1aAlma4-c]=mk3SLVN4HKm; delPer=0; BD_CK_SAM=1; PSINO=1; BDRCVFR[S_ukKV6dOkf]=mk3SLVN4HKm; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_645EC=1ca8wohXaH4gHIjrqDXVa0cDekSx3Kaem5kzoR%2BMTsGHRIld8yQe%2BpZqvbk; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1421_32439_32532_32328_32348_32045_32270_32115_31322_22157'
}
# Build the request object
req = urllib.request.Request(url, headers=headers)
# Send the request, then read and decode the response body (closed on exit)
with urllib.request.urlopen(req) as res:
    html = res.read().decode('utf-8')
# Write the page to a file
with open('结果.html', 'w', encoding='utf-8') as f:
    f.write(html)
结果
输出一个html文件
quote(字符串) (这个⾥⾯的参数是个字符串)
- 百度贴吧练习一
# 需求:输入要爬取贴吧的名称,输入爬取的起始页和终止页,把每一页保存到本地
# 分析:1.找url的规律
# 第一页 https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&pn=0
# 第二页 https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&pn=50
# 第三页 https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&pn=100
# 第四页 https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&pn=(n-1)*50
# 页数规律 pn = (当前页数-1)*50
# 分析:2.获取网页内容
# 分析:3.获取数据
- 使用User-Agent时候,为了不让百度知道是人是鬼在操作,引入random
import random
import urllib.request
import urllib.parse
# Pick one User-Agent at random; more entries can be added to this list
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'},
]
headers = random.choice(headers_list)

name = input('请输入贴吧名:')
start = int(input('请输入起始页:'))
end = int(input('请输入结束页:'))

# Percent-encode the tieba name
kw = urllib.parse.urlencode({'kw': name})

baseurl = 'https://tieba.baidu.com/f?'
# More than one page is requested, so loop over the page numbers
for i in range(start, end + 1):
    # Build the URL: pn = (page - 1) * 50
    pn = (i - 1) * 50
    url = baseurl + kw + '&pn=' + str(pn)
    # Send the request, then read and decode the response body
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    # Save the page
    filename = '第' + str(i) + '页.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print(f'正在爬取第{i}页')
        f.write(html)
结果
- 练习二
# import random
import urllib.request
import urllib.parse# 读取页面的逻辑封装
def readPage(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a fixed browser User-Agent header. The response
    object is closed before returning.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
    }
    # Build the request, send it, then read and decode the body
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read().decode('utf-8')


# Write the page to a file
def writePage(filename, html):
    """Write the string *html* to *filename* as UTF-8 text, overwriting it."""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)


# Main function
def main():
    """Prompt for a tieba name and page range, then save each page to disk.

    Page ``i`` is fetched with readPage() and written by writePage() to
    ``第{i}页.html`` in the current directory.
    """
    name = input('请输入贴吧名:')
    start = int(input('请输入起始页:'))
    end = int(input('请输入结束页:'))
    # Percent-encode the tieba name
    kw = urllib.parse.urlencode({'kw': name})
    baseurl = 'https://tieba.baidu.com/f?'
    for i in range(start, end + 1):
        # Build the URL: pn = (page - 1) * 50
        pn = (i - 1) * 50
        url = baseurl + kw + '&pn=' + str(pn)
        html = readPage(url)
        filename = '第' + str(i) + '页.html'
        writePage(filename, html)


if __name__ == '__main__':
    main()

结果三个页面
请输入贴吧名:帅哥
请输入起始页:1
请输入结束页:3
- 练习三 类对象实现
import urllib.request
import urllib.parse


class BaiduSpider:
    """Download a range of Baidu Tieba list pages and save each as an HTML file."""

    def __init__(self):
        # Values that stay the same for every request live on the instance
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
        }
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        """Fetch *url* with ``self.headers`` and return the body decoded as UTF-8."""
        req = urllib.request.Request(url, headers=self.headers)
        # The response object is closed when the `with` block exits
        with urllib.request.urlopen(req) as res:
            return res.read().decode('utf-8')

    def writePage(self, filename, html):
        """Write the string *html* to *filename* as UTF-8 text, overwriting it."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def main(self):
        """Prompt for a tieba name and page range, then save each page to disk."""
        name = input('请输入贴吧名:')
        start = int(input('请输入起始页:'))
        end = int(input('请输入结束页:'))
        # Percent-encode the tieba name
        kw = urllib.parse.urlencode({'kw': name})
        for i in range(start, end + 1):
            # Build the URL: pn = (page - 1) * 50
            pn = (i - 1) * 50
            url = self.baseurl + kw + '&pn=' + str(pn)
            html = self.readPage(url)
            filename = '第' + str(i) + '页.html'
            self.writePage(filename, html)


if __name__ == '__main__':
    # main() is an instance method, so the class has to be instantiated first
    spider = BaiduSpider()
    spider.main()
3 请求⽅式
-
GET 特点 :查询参数在URL地址中显示
-
POST 在Request⽅法中添加data参数
urllib.request.Request(url,data=data,headers=headers)
data :表单数据以bytes类型提交,不能是str
-
有道翻译练习
import urllib.request
import urllib.parse
import json

# Text to translate
key = input('请输入要翻译的内容:')
# Form fields to submit with the POST request
form = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15980993133958',
    'sign': '8d249124b310aa8e7fa82f24049ff7b7',
    'lts': '1598099313395',
    'bv': '94d04da9bee8870ad9ad8714b54f2bea',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME',
}
# The request body must be bytes, not str: urlencode first, then encode
data = bytes(urllib.parse.urlencode(form), 'utf-8')

# Send the request and get the response.
# NOTE: the original author's note says "_o" has to be removed, presumably
# from "translate_o" in the URL the browser uses; confirm against the site.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}
req = urllib.request.Request(url, data=data, headers=headers)
with urllib.request.urlopen(req) as res:
    html = res.read().decode('utf-8')

# html is a JSON string, e.g.
# {"type":"EN2ZH_CN","errorCode":0,"elapsedTime":1,"translateResult":[[{"src":"money","tgt":"钱"}]]}
# Parse it into a dict
r_dict = json.loads(html)
r = r_dict['translateResult']  # nested list, e.g. [[{"src":"money","tgt":"钱"}]]
result = r[0][0]['tgt']  # r[0][0] is the dict {"src":"money","tgt":"钱"}
print(result)
请输入要翻译的内容:luck
运气
4 requests模块
4.1 安装
pip install requests
在开发⼯具中安装
4.2 requests常用方法
requests.get(⽹址)
4.3 响应对象response的⽅法
response.text 返回unicode格式的数据(str)
print(response.text) # 返回的是str类型
response.content 返回字节流数据(⼆进制)
print(response.content) # 返回的是字节流
response.content.decode(‘utf-8’) ⼿动进⾏解码
print(response.content.decode('utf-8')) # ⼿动进⾏解码
response.url 返回url
response.encoding = '编码'
可以解决乱码问题
import requests

response = requests.get('http://www.qqbiaoqing.com/gaoxiao/')
# print(response.content.decode('utf-8'))
response.encoding = 'utf-8'  # without this line response.text comes out garbled
print(response.text)
4.4 requests模块发送 POST请求
import requests
import json

key = input('请输入翻译的内容')
# Form fields to submit. The dict is passed directly as data=, so there is
# no manual urlencode()/bytes step here. These fields are used because the
# typed text only shows up in the request's form data.
data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15980993133958',
    'sign': '8d249124b310aa8e7fa82f24049ff7b7',
    'lts': '1598099313395',
    'bv': '94d04da9bee8870ad9ad8714b54f2bea',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME',
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}
res = requests.post(url, data=data, headers=headers)
res.encoding = 'utf-8'
html = res.text
# Parse the JSON string and pick the translated text out of the nested list
r_dict = json.loads(html)
result = r_dict['translateResult'][0][0]['tgt']
print(result)
请输入翻译的内容蜘蛛侠
spider-man
4.5 requests设置代理
使⽤requests添加代理只需要在请求⽅法中(get/post)传递proxies参数就可
以了
设置代理 http://www.httpbin.org/ip
代理⽹站
⻄刺免费代理IP:http://www.xicidaili.com/
快代理:http://www.kuaidaili.com/
代理云:http://www.dailiyun.com/
import requests

# Proxy settings: the key is the scheme of the target URL, the value is the
# proxy URL. The proxy URL includes an explicit scheme because recent
# requests/urllib3 releases reject scheme-less proxy URLs.
proxy = {
    'http': 'http://116.196.85.190:3128'
}
url = 'http://www.httpbin.org/ip'
res = requests.get(url, proxies=proxy)
print(res.text)
4.6 cookie
cookie :通过在客户端记录的信息确定⽤户身份,一旦确定之后,不用重复登录
HTTP是一种无状态协议:客户端和服务器的交互仅限于一次请求/响应过程,结束后即断开;下一次请求时,服务器会认为这是一个新的客户端。为了维持会话,让服务器知道这是前一个用户发起的请求,必须在某个地方保存客户端信息。
- 练习模拟登陆知乎
import requests# resp = requests.get('https://www.baidu.com/')
#
# # print(resp.cookies.get_dict())# 模拟登陆知乎
url = 'https://www2.zhihu.com/hot'
# NOTE(review): the cookie below looks like one captured from a logged-in
# browser session; readers will presumably need to substitute their own.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
    'cookie': 'd_c0="AIBf6s8OKxGPTsCiOuJyoRVgNBjHstjKmcg=|1587727340"; _zap=6fc01f58-8fb3-4545-a781-417edf50e819; z_c0="2|1:0|10:1590034345|4:z_c0|92:Mi4xWG5HTEV3QUFBQUFBZ0ZfcXp3NHJFU1lBQUFCZ0FsVk5xVTJ6WHdBbm5kUGtSLXJQb2FMU1hENzN4S2FmbDVaTEln|b476020625a8a9eb38d3924904d14ed3f57ad5a55b29d104cf80d35e81bea1ee"; q_c1=c02ae3a1e68047a9965dc958423c0d2a|1597549527000|1590629112000; _xsrf=gBkpgEOmNKWElWsbjf040VnefHxTTo7h; SESSIONID=oEXxVPA7EVUX5OiYt43xIOXiy7UCkPTslsTf8EGneVW; JOID=U1sQBUPv6PqCRVtPRujzLV46uYVTzs3ar296ambFyd-iaHFuY874tNpOX0pL48DS0lAf9RdCtJESk6_0u8T_Xwg=; osd=UFoUBkPs6f6BRVhOQuvzLl8-uoVQz8nZr2x7bmXFyt6ma3FtYsr7tNlPW0lL4MHW0VAc9BNBtJITl6z0uMX7XAg=; tst=h; KLBRSID=ca494ee5d16b14b649673c122ff27291|1598232043|1598231842; tshl=',
}
# Send the cookie along with the request and print the returned page
resp = requests.get(url, headers=headers)
print(resp.text)
4.7 session
session :通过在服务端记录的信息确定⽤户身份 这⾥这个session就是⼀个指的是会话
- 案例演示 首先对12306验证图片的获取–显得臃肿
import base64
# 需要删掉 data:image/jpg;base64,
url = ''
# A data URI looks like "data:image/jpg;base64,<payload>", and only the part
# after the first comma is base64. Dropping the prefix here means it no
# longer has to be deleted by hand; leaving it in is presumably what caused
# the "binascii.Error: Incorrect padding" noted by the original author.
payload = url.split(',', 1)[-1]
img_data = base64.b64decode(payload)
# Write the decoded image bytes; the file is closed when the block exits
with open('code.png', 'wb') as fn:
    fn.write(img_data)
- 不臃肿的方式
# https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand&1598271087088&callback=jQuery19102009473057696416_1598271069042&_=1598271069044
# https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand&1598271171181&callback=jQuery19102009473057696416_1598271069042&_=1598271069045
# 一样的数据 https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand
# 删掉64 后 得到12306的验证码图片 https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand
- 拿到验证码图片,并验证成功–目前还没验证成功,可能是网速问题
# {result_message: "验证码校验成功", result_code: "4"}
# Plan: 1. request the target URL
#       2. fetch the 12306 captcha image
#       3. click (submit the coordinates of) the correct captcha pictures
import requests

# One session object is shared by both requests in login()
req = requests.session()


def login():
    """Download the 12306 captcha image, then submit user-entered coordinates.

    The image is saved to ``code2.png``; the coordinates typed at the prompt
    are posted to the captcha-check endpoint and the reply text is printed.
    """
    # 2. Fetch the captcha image. This URL returns the picture itself, so no
    # base64 decoding is done here.
    pic_response = req.get('https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand')
    # response.content is the raw image bytes
    codeImage = pic_response.content
    with open('code2.png', 'wb') as fn:
        fn.write(codeImage)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
        'Cookie': '_passport_session=98667d47f74a4c7d81305901dd36c9e01660; _passport_ct=62e61089c5a44c0085fcb801c9f58c32t0648; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_wfdc_flag=dc; RAIL_EXPIRATION=1598577388645; RAIL_DEVICEID=Z62kUSQ93InJ0ytt_VHqZ0HyI4LgmNJPNCy1YWOmEuLJJ6LnPtaqBp3KaJAlS7wPEQomC44E9qWi3XDmqvSla5-ciBeh66WoG9LP_kC2S876b9gFRBBP7P4MDq1q8HPk97o69ofSz6QtKyoaMyhrj7_-zjRk2SPw; _jc_save_fromDate=2020-08-25; _jc_save_toDate=2020-08-24; _jc_save_toStation=%u6C88%u9633%2CSYT; BIGipServerotn=317719050.38945.0000; route=c5c62a339e7744272a54643b3be5bf64; BIGipServerpool_passport=267190794.50215.0000',
    }

    codeStr = input('请输入验证码坐标:')
    data = {
        'answer': codeStr,  # e.g. '58,53,266,45'
        'rand': 'sjrand',
        'login_site': 'E',
    }
    # 1. Request the target URL. Per the original author's note, the site
    # currently uses GET for this check; POST is used to keep the demo simple.
    response = req.post('https://kyfw.12306.cn/passport/captcha/captcha-check', data=data, headers=headers)
    print(response.text)
    # 3. Click the correct captcha pictures


login()
- 拿到车次信息
import requests


def query():
    """Request the 12306 left-ticket query URL and print the decoded reply.

    The date (2020-09-01) and the station codes (BJP to SHH) are fixed in
    the URL.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
        'Cookie': '_uab_collina=159541602490128424256385; JSESSIONID=406B64ECCD5906E667DD5D42AD9045C7; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_wfdc_flag=dc; RAIL_EXPIRATION=1598577388645; RAIL_DEVICEID=Z62kUSQ93InJ0ytt_VHqZ0HyI4LgmNJPNCy1YWOmEuLJJ6LnPtaqBp3KaJAlS7wPEQomC44E9qWi3XDmqvSla5-ciBeh66WoG9LP_kC2S876b9gFRBBP7P4MDq1q8HPk97o69ofSz6QtKyoaMyhrj7_-zjRk2SPw; BIGipServerotn=317719050.38945.0000; route=c5c62a339e7744272a54643b3be5bf64; BIGipServerpool_passport=267190794.50215.0000; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_toDate=2020-08-25; _jc_save_fromDate=2020-09-01',
    }
    r = requests.get(
        'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2020-09-01&leftTicketDTO.from_station=BJP&leftTicketDTO.to_station=SHH&purpose_codes=ADULT',
        headers=headers,
    )
    # Decode the raw bytes explicitly as UTF-8
    print(r.content.decode('utf-8'))


query()
4.8 处理不信任的SSL证书
什么是SSL证书?
SSL证书是数字证书的⼀种,类似于驾驶证、护照和营业执照的电⼦副本。
因为配置在服务器上,也称为SSL服务器证书。SSL 证书就是遵守 SSL协
议,由受信任的数字证书颁发机构CA,在验证服务器身份后颁发,具有服务
器身份验证和数据传输加密功能
import requests
url = 'https://inv-veri.chinatax.gov.cn/'
# verify=False turns off TLS certificate verification, which lets the
# request through despite the untrusted certificate.
# WARNING: with verification off the server's identity is not checked, so
# the connection is open to man-in-the-middle attacks. Use it only for
# demos or hosts you already trust.
resp = requests.get(url, verify=False)
print(resp.text)