创建爬虫
可以通过命令行工具快速创建爬虫
创建爬虫
hoopa create -s demo
- -s 表示创建的是单文件爬虫
另外还有可选参数 -f(--full),可以创建方法比较齐全的爬虫
hoopa create -s demo2 -f
创建爬虫项目
hoopa create -p demoProject
例子:hoopa create -s demo -f
然后添加开始url: "https://httpbin.org/get"
import hoopa
from hoopa import Spider
from hoopa import Item
class DataItem(Item):
    """Empty item placeholder; no fields are declared yet."""
class CommonMiddleware:
    """Middleware skeleton whose two hooks are intentionally no-ops."""

    def process_request(self, request, spider_ins):
        """Request hook; does nothing and returns None."""

    def process_response(self, request, response, spider_ins):
        """Response hook; does nothing and returns None."""
class Demo(Spider):
    """Single-file spider skeleton named "demo"."""

    name = "demo"
    middlewares = [CommonMiddleware]

    def start_requests(self):
        """Yield the initial request, with ``parse`` as its callback."""
        # The url is an empty placeholder; fill in the start url before running.
        yield hoopa.Request(url="", callback=self.parse)

    def parse(self, request, response):
        """Print the response handed to this callback."""
        print(response)

    def process_item(self, item_list: list):
        """Print every item in ``item_list``."""
        for item in item_list:
            print(item)
if __name__ == "__main__":
    # Call Demo.start() only when this file is executed as a script.
    Demo.start()
运行爬虫
python demo.py
重写初始请求(start_requests)
async def start_requests(self):
    """Yield the initial request for the httpbin JSON endpoint.

    The response is routed to ``parse_json``, matching the complete listing
    at the end of this tutorial.
    """
    yield Request(url="https://httpbin.org/json", callback=self.parse_json)
添加Item
class DataItem(Item):
    """Item declaring two string fields: ``title`` and ``type``."""

    title: str
    type: str
处理响应,解析
async def parse_json(self, request, response):
    """Turn each slide of the JSON response into a ``DataItem``.

    Reads ``response.json()["slideshow"]["slides"]`` and yields one item per
    slide, carrying that slide's ``title`` and ``type``.
    """
    data = response.json()
    slides = data["slideshow"]["slides"]
    for slide in slides:
        data_item = DataItem()
        data_item.title = slide["title"]
        data_item.type = slide["type"]
        yield data_item
使用中间件
class CommonMiddleware:
    """Middleware that sets a timeout on every request it is given."""

    def process_request(self, request, spider_ins):
        """Set ``request.timeout`` to 3 and return None."""
        # NOTE(review): the unit is presumably seconds; confirm against hoopa.
        request.timeout = 3

    def process_response(self, request, response, spider_ins):
        """Response hook; does nothing and returns None."""
数据存储
这里只打印
async def process_item(self, item_list: list):
    """Print every item in ``item_list``; nothing is persisted."""
    for item in item_list:
        print(item)
完整代码
# -*- coding: utf-8 -*-
from hoopa import Spider, Request
from hoopa import Item
from hoopa import Middleware
class DataItem(Item):
    """Item declaring two string fields: ``title`` and ``type``."""

    title: str
    type: str
class CommonMiddleware:
    """Middleware that sets a timeout on every request it is given."""

    def process_request(self, request, spider_ins):
        """Set ``request.timeout`` to 3 and return None."""
        # NOTE(review): the unit is presumably seconds; confirm against hoopa.
        request.timeout = 3

    def process_response(self, request, response, spider_ins):
        """Response hook; does nothing and returns None."""
class Demo(Spider):
    """Spider named "demo" that fetches httpbin's JSON sample and prints its slides."""

    name = "demo"
    # NOTE(review): start_requests is overridden below, so start_urls may go
    # unused; confirm against hoopa's Spider before relying on it.
    start_urls = ["https://httpbin.org/get"]
    middlewares = [CommonMiddleware]

    async def start_requests(self):
        """Yield the initial request, with ``parse_json`` as its callback."""
        yield Request(url="https://httpbin.org/json", callback=self.parse_json)

    async def parse_json(self, request, response):
        """Yield one ``DataItem`` per slide found in the JSON response.

        Reads ``response.json()["slideshow"]["slides"]`` and copies each
        slide's ``title`` and ``type`` onto a fresh item.
        """
        data = response.json()
        slides = data["slideshow"]["slides"]
        for slide in slides:
            data_item = DataItem()
            data_item.title = slide["title"]
            data_item.type = slide["type"]
            yield data_item

    async def process_item(self, item_list: list):
        """Print every item in ``item_list``; nothing is persisted."""
        for item in item_list:
            print(item)
if __name__ == "__main__":
    # Call Demo.start() only when this file is executed as a script.
    Demo.start()