
Creating a Spider

You can quickly create a spider with the command-line tool.

Create a spider

hoopa create -s demo
  • -s creates a single-file spider

There is also an optional -f (--full) flag, which creates a spider with a more complete set of methods:

hoopa create -s demo2 -f

Create a spider project

hoopa create -p demoProject

Example: hoopa create -s demo -f

Then add the start URL "https://httpbin.org/get" to the generated template shown below:

import hoopa
from hoopa import Spider
from hoopa import Item


class DataItem(Item):
    # declare the item fields here
    pass


class CommonMiddleware:
    def process_request(self, request, spider_ins):
        # called before each request is sent
        pass

    def process_response(self, request, response, spider_ins):
        # called after each response is received
        pass


class Demo(Spider):
    name = "demo"
    middlewares = [CommonMiddleware]

    def start_requests(self):
        # fill in the start URL here
        yield hoopa.Request(url="", callback=self.parse)

    def parse(self, request, response):
        print(response)

    def process_item(self, item_list: list):
        for item in item_list:
            print(item)


if __name__ == "__main__":
    Demo.start()
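
The generated start_requests yields a Request with an empty url as a placeholder. The start URL mentioned above can be supplied there, or declared at class level via start_urls, as in the complete example at the end of this page:

class Demo(Spider):
    name = "demo"
    # declare the start URL(s) at class level
    start_urls = ["https://httpbin.org/get"]
    middlewares = [CommonMiddleware]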

Run the spider

python demo.py

Override the initial requests

async def start_requests(self):
    yield Request(url="https://httpbin.org/json", callback=self.parse)
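
start_requests can also yield several initial requests, each with its own callback. A minimal sketch reusing the two httpbin URLs from this page (parse_json is defined in the parsing step below):

async def start_requests(self):
    # each yielded Request is scheduled independently and routed to its callback
    yield Request(url="https://httpbin.org/get", callback=self.parse)
    yield Request(url="https://httpbin.org/json", callback=self.parse_json)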

Add an Item

class DataItem(Item):
    title: str
    type: str
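
Field values are assigned on an instance, and the instance is yielded from a parse method, as shown in the next step; the strings here are only illustrative:

data_item = DataItem()
data_item.title = "some title"
data_item.type = "some type"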

Process and parse the response

async def parse_json(self, request, response):
    data = response.json()
    slides = data["slideshow"]["slides"]
    for slide in slides:
        data_item = DataItem()
        data_item.title = slide["title"]
        data_item.type = slide["type"]
        yield data_item
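
For reference, https://httpbin.org/json returns a document roughly shaped like this (abridged to the fields read above):

# abridged shape of the https://httpbin.org/json response
{
    "slideshow": {
        "title": "Sample Slide Show",
        "slides": [
            {"title": "Wake up to WonderWidgets!", "type": "all"},
            {"title": "Overview", "type": "all"},
        ],
    }
}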

Use middleware

class CommonMiddleware:
    def process_request(self, request, spider_ins):
        # set a per-request timeout of 3 seconds
        request.timeout = 3

    def process_response(self, request, response, spider_ins):
        pass
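
process_request runs before each request is sent and process_response after each response is received, so middleware is a good place for cross-cutting concerns such as logging. A minimal sketch, assuming Request exposes the url it was constructed with:

class LoggingMiddleware:
    def process_request(self, request, spider_ins):
        # log every outgoing request
        print(f"requesting {request.url}")

    def process_response(self, request, response, spider_ins):
        # log every completed request
        print(f"finished {request.url}")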

Store the data

Here we only print the items:

async def process_item(self, item_list: list):
    for item in item_list:
        print(item)
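
In a real spider the items would be persisted rather than printed. A minimal sketch that appends each item to a JSON Lines file, assuming the fields set in parse_json are readable as plain attributes:

import json

async def process_item(self, item_list: list):
    # write one JSON object per line to a local file
    with open("items.jsonl", "a", encoding="utf-8") as f:
        for item in item_list:
            record = {"title": item.title, "type": item.type}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")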

Complete code

# -*- coding: utf-8 -*-

from hoopa import Spider, Request, Item


class DataItem(Item):
    title: str
    type: str


class CommonMiddleware:
    def process_request(self, request, spider_ins):
        request.timeout = 3

    def process_response(self, request, response, spider_ins):
        pass


class Demo(Spider):
    name = "demo"
    start_urls = ["https://httpbin.org/get"]
    middlewares = [CommonMiddleware]

    async def start_requests(self):
        yield Request(url="https://httpbin.org/json", callback=self.parse_json)

    async def parse_json(self, request, response):
        data = response.json()
        slides = data["slideshow"]["slides"]
        for slide in slides:
            data_item = DataItem()
            data_item.title = slide["title"]
            data_item.type = slide["type"]
            yield data_item

    async def process_item(self, item_list: list):
        for item in item_list:
            print(item)


if __name__ == "__main__":
    Demo.start()