云计算

原创文章,欢迎转载。转载请注明:转载自IT人故事会,谢谢!
原文链接地址:「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)

上一篇已经分析出了app的具体请求链接,本篇主要介绍如何用python开发脚本,抓取APP里面的信息。源码:https://github.com/limingios/dockerpython.git

分析app数据包

查看分析

解析出来的header

夜神配置

python代码,爬取分类

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm

import requests

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    The header set is large because every vendor designs its requests
    differently and the Fiddler capture contains many fields. Some of them
    are probably optional; the only way to find out is to comment them out
    one by one and retry.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Request the recipe category catalogue and print the raw response body."""
    # URL, keys and values are quoted strings; the published listing had lost
    # its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    print(response.text)

# Run the catalogue request when the script is executed.
handle_index()

爬取详情,信息通过分类找到里面的详情

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                }
                queue_list.put(data_2)

# Populate the queue, then report how many search payloads were queued.
handle_index()
print(queue_list.qsize())

分类菜谱内部的详情信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)

def handle_caipu_list(data):
    """Search recipes for one queued keyword and print a summary of each hit.

    Args:
        data: Search payload produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    # NOTE(review): the ["list"] subscript is reconstructed. The published
    # listing breaks off right after [result], apparently because "[list]"
    # was consumed as markup -- confirm against a live response.
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed (presumably recipes;
        # other entry types are skipped).
        if caipu_item["type"] != 13:
            continue
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        caipu_info["user_name"] = caipu_item["r"]["an"]
        caipu_info["shicai_id"] = caipu_item["r"]["id"]
        # Strip newlines and spaces from the story text.
        caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
        caipu_info["caipu_name"] = caipu_item["r"]["n"]
        caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
        print(caipu_info)


# Fill the queue with search payloads, then process the first one.
handle_index()
handle_caipu_list(queue_list.get())

菜品内部的详情信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)

def handle_caipu_list(data):
    """Search recipes for one queued keyword, fetch each hit's detail, print it.

    Args:
        data: Search payload produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    # NOTE(review): the ["list"] subscript is reconstructed. The published
    # listing breaks off right after [result], apparently because "[list]"
    # was consumed as markup -- confirm against a live response.
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed (presumably recipes;
        # other entry types are skipped).
        if caipu_item["type"] != 13:
            continue
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        caipu_info["user_name"] = caipu_item["r"]["an"]
        caipu_info["shicai_id"] = caipu_item["r"]["id"]
        # Strip newlines and spaces from the story text.
        caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
        caipu_info["caipu_name"] = caipu_item["r"]["n"]
        caipu_info["zuoliao_list"] = caipu_item["r"]["major"]

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # NOTE(review): the published listing built "ext" by string
        # concatenation and lost its inner quotes; json.dumps rebuilds the
        # same JSON shape with proper escaping. The values are assumed to be
        # strings -- confirm against the captured request.
        ext_query = {
            "query": {
                "kw": data["keyword"],
                "src": "2803",
                "idx": "1",
                "type": "13",
                "id": str(caipu_info["shicai_id"]),
            }
        }
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": json.dumps(ext_query, ensure_ascii=False),
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        print(json.dumps(caipu_info))


# Fill the queue with search payloads, then process the first one.
handle_index()
handle_caipu_list(queue_list.get())

将数据保存在MongoDB中

通过vagrant 安装虚拟机

vagrant up

进入虚拟机

ip 192.168.66.100

su -
# password: vagrant
docker

拉取mongodb的镜像

https://hub.docker.com/r/bitnami/mongodb
默认端口:27017

docker pull bitnami/mongodb:latest

创建mongodb的容器


# Create a bitnami/mongodb directory under the current directory.
mkdir bitnami
cd bitnami
mkdir mongodb
# Start the MongoDB container detached and publish port 27017.
# NOTE(review): /path/to/mongodb-persistence is a placeholder host path, and
# the directories created above are not referenced by this command -- confirm
# the intended volume mapping against the image documentation.
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest

# Stop the firewall (presumably so port 27017 is reachable from outside the VM).
systemctl stop firewalld

>用第三方工具连接

![](https://upload-images.jianshu.io/upload_images/11223715-aea2f13184d728c2.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

>连接mongodb的工具

``` python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/11 0:53
# @Author  :  liming
# @Site    : 
# @File    : handle_mongodb.py
# @url    : idig8.com
# @Software: PyCharm

import pymongo
from pymongo.collection import Collection

class Connect_mongo(object):
    """Small wrapper around the MongoDB database that stores scraped items."""

    def __init__(self):
        # Host and database name must be quoted strings; the published
        # listing had lost its quotes and did not parse.
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert ``item`` into the ``dou_guo_mei_shi_item`` collection.

        Args:
            item: A single document (dict) or a list of documents.
        """
        db_collection = Collection(self.db_data, "dou_guo_mei_shi_item")
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one/insert_many are the supported replacements.
        # insert() accepted either a document or a list, so both are kept.
        if isinstance(item, list):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)

# Expose a shared instance for other modules to import.
mongo_info = Connect_mongo()

```

python爬取的数据通过上面的mongo工具类,保存到centos7虚拟机中docker容器里运行的MongoDB

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue
from handle_mongo import mongo_info

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)

def handle_caipu_list(data):
    """Search recipes for one queued keyword and store each hit in MongoDB.

    For every matching entry the detail endpoint is queried as well, and the
    combined record is passed to ``mongo_info.insert_item``.

    Args:
        data: Search payload produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    # NOTE(review): the ["list"] subscript is reconstructed. The published
    # listing breaks off right after [result], apparently because "[list]"
    # was consumed as markup -- confirm against a live response.
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed (presumably recipes;
        # other entry types are skipped).
        if caipu_item["type"] != 13:
            continue
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        caipu_info["user_name"] = caipu_item["r"]["an"]
        caipu_info["shicai_id"] = caipu_item["r"]["id"]
        # Strip newlines and spaces from the story text.
        caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
        caipu_info["caipu_name"] = caipu_item["r"]["n"]
        caipu_info["zuoliao_list"] = caipu_item["r"]["major"]

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # NOTE(review): the published listing built "ext" by string
        # concatenation and lost its inner quotes; json.dumps rebuilds the
        # same JSON shape with proper escaping. The values are assumed to be
        # strings -- confirm against the captured request.
        ext_query = {
            "query": {
                "kw": data["keyword"],
                "src": "2803",
                "idx": "1",
                "type": "13",
                "id": str(caipu_info["shicai_id"]),
            }
        }
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": json.dumps(ext_query, ensure_ascii=False),
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)


# Fill the queue with search payloads, then process the first one.
handle_index()
handle_caipu_list(queue_list.get())

通过python多线程-线程池抓取

python3 中通过 from concurrent.futures import ThreadPoolExecutor 导入线程池

引用线程池

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` with the captured app headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot block a worker thread forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)

def handle_caipu_list(data):
    """Search recipes for one queued keyword and store each hit in MongoDB.

    For every matching entry the detail endpoint is queried as well, and the
    combined record is passed to ``mongo_info.insert_item``.

    Args:
        data: Search payload produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    # NOTE(review): the ["list"] subscript is reconstructed. The published
    # listing breaks off right after [result], apparently because "[list]"
    # was consumed as markup -- confirm against a live response.
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed (presumably recipes;
        # other entry types are skipped).
        if caipu_item["type"] != 13:
            continue
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        caipu_info["user_name"] = caipu_item["r"]["an"]
        caipu_info["shicai_id"] = caipu_item["r"]["id"]
        # Strip newlines and spaces from the story text.
        caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
        caipu_info["caipu_name"] = caipu_item["r"]["n"]
        caipu_info["zuoliao_list"] = caipu_item["r"]["major"]

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # NOTE(review): the published listing built "ext" by string
        # concatenation and lost its inner quotes; json.dumps rebuilds the
        # same JSON shape with proper escaping. The values are assumed to be
        # strings -- confirm against the captured request.
        ext_query = {
            "query": {
                "kw": data["keyword"],
                "src": "2803",
                "idx": "1",
                "type": "13",
                "id": str(caipu_info["shicai_id"]),
            }
        }
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": json.dumps(ext_query, ensure_ascii=False),
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)


# Fill the queue, then hand every queued payload to a pool of 20 worker threads.
handle_index()
pool = ThreadPoolExecutor(max_workers=20)
while queue_list.qsize() > 0:
    pool.submit(handle_caipu_list, queue_list.get())

通过使用代理IP隐藏爬虫

当app的运维人员发现我们一直在请求他们的服务器时,很可能会把咱们的ip封掉。因此可以通过代理ip的方式,隐藏自己的真实ip。

注册申请 abuyun.com

一个小时1元,我申请了一个小时咱们一起使用下

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/11 2:40
# @Author  : Aries
# @Site    : 
# @File    : handle_proxy.py
# @Software: PyCharm

#60.17.177.187 代理出来的ip
import  requests
# Fetch a page through the proxy and print the response body (presumably the
# IP address the target site sees; the listing notes 60.17.177.187 as the
# proxied IP).
# The string literals are plain single-quoted strings; the published listing
# showed them with backslash-escaped quotes, which is not valid Python.
url = 'http://ip.hahado.cn/ip'
# NOTE(review): the proxy credentials are hard-coded in source; consider
# loading them from configuration instead of committing them.
proxy = {'http': 'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
response = requests.get(url=url, proxies=proxy)
print(response.text)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    : 
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json

import requests

from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor

# Create the queue that handle_index fills with search payloads.
queue_list = Queue()

def handle_request(url, data, timeout=30):
    """POST ``data`` to ``url`` through the HTTP proxy with the captured headers.

    Args:
        url: Endpoint to POST to.
        data: Form fields sent as the request body.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot block a worker thread forever.

    Returns:
        The response object returned by ``requests.post``.
    """
    # The published listing had lost its quotes, which made this literal
    # invalid Python. Header values are written as strings because
    # ``requests`` rejects non-string header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }

    # Only plain-http traffic is routed through the proxy (single 'http' key).
    # NOTE(review): the proxy credentials are hard-coded in source; consider
    # loading them from configuration instead of committing them.
    proxy = {'http': 'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
    response = requests.post(url=url, headers=header, data=data, proxies=proxy, timeout=timeout)
    return response

def handle_index():
    """Fetch the category catalogue and queue one search payload per leaf entry.

    Each payload is a dict of form fields whose ``keyword`` is the entry's
    name; payloads are pushed onto the module-level ``queue_list``.
    """
    # URL, keys, values and subscripts are quoted strings; the published
    # listing had lost its quotes and did not parse.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalogue is walked three levels deep, each level under a "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)

def handle_caipu_list(data):
    """Search recipes for one queued keyword and store each hit in MongoDB.

    For every matching entry the detail endpoint is queried as well, and the
    combined record is passed to ``mongo_info.insert_item``.

    Args:
        data: Search payload produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    # NOTE(review): the ["list"] subscript is reconstructed. The published
    # listing breaks off right after [result], apparently because "[list]"
    # was consumed as markup -- confirm against a live response.
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed (presumably recipes;
        # other entry types are skipped).
        if caipu_item["type"] != 13:
            continue
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        caipu_info["user_name"] = caipu_item["r"]["an"]
        caipu_info["shicai_id"] = caipu_item["r"]["id"]
        # Strip newlines and spaces from the story text.
        caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
        caipu_info["caipu_name"] = caipu_item["r"]["n"]
        caipu_info["zuoliao_list"] = caipu_item["r"]["major"]

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # NOTE(review): the published listing built "ext" by string
        # concatenation and lost its inner quotes; json.dumps rebuilds the
        # same JSON shape with proper escaping. The values are assumed to be
        # strings -- confirm against the captured request.
        ext_query = {
            "query": {
                "kw": data["keyword"],
                "src": "2803",
                "idx": "1",
                "type": "13",
                "id": str(caipu_info["shicai_id"]),
            }
        }
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": json.dumps(ext_query, ensure_ascii=False),
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)


# Fill the queue, then hand every queued payload to a pool of 2 worker threads.
handle_index()
pool = ThreadPoolExecutor(max_workers=2)
while queue_list.qsize() > 0:
    pool.submit(handle_caipu_list, queue_list.get())

PS:本次是app数据抓取的入门。首先把模拟器的代理指向本地电脑(已安装fiddler),这样fiddler就可以抓到app的数据包;分析数据包时要凭借自己的经验找到对应的url,如果能分析出url,爬虫基本就写好一半了。接着封装请求头,请求头是通过fiddler获取的,里面的header内容比较多,可以尝试逐个删除、精简到最少,这也是应对反爬虫的一种策略:有的字段放进去反倒容易被发现是爬虫,例如cookies等等,但也有的接口必须带上cookies才能爬取数据。再通过代理的方式设置代理ip,防止爬取过程中同一个ip一直请求同一个接口而被发现是爬虫。引入队列的目的是为了在使用线程池时方便取出任务,最后把抓取结果存入mongodb。这样,使用多线程抓取app数据的流程就完成了。

「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)-编程知识网