scrapy模拟登录

2022-06-15

字数统计: 219字 | 阅读时长: 1min

import scrapy


class LoginSpider(scrapy.Spider):
    """Log in to GitHub by replaying its login form, then fetch a profile page.

    Flow: ``parse`` reads the hidden form fields from the login page and POSTs
    them together with the credentials; ``after_login`` requests a profile
    page; ``check_login`` prints that page's title.
    """

    name = 'login'
    start_urls = ['https://github.com/login']

    # Hidden inputs of the login form that must be echoed back in the POST.
    _HIDDEN_FIELDS = ('authenticity_token', 'timestamp', 'timestamp_secret')

    def _hidden_value(self, response, field):
        """Return the ``value`` of the login-form input named *field*, or None.

        The input is selected by its ``name`` attribute rather than by its
        position inside the form, so reordering or inserting inputs in the
        page markup does not break the lookup.
        """
        query = '//*[@id="login"]//form//input[@name="%s"]/@value' % field
        return response.xpath(query).extract_first()

    def parse(self, response):
        """Build the login POST from the fetched login page and send it.

        Yields nothing (after logging an error) when one of the hidden fields
        cannot be found, instead of passing ``None`` values into the form data.
        """
        hidden = {field: self._hidden_value(response, field)
                  for field in self._HIDDEN_FIELDS}

        missing = [field for field, value in hidden.items() if value is None]
        if missing:
            self.logger.error(
                'Login form fields not found on %s: %s',
                response.url, ', '.join(missing),
            )
            return

        data = {
            "commit": "Sign in",
            # The token changes on every page load; it is read from the login page.
            "authenticity_token": hidden['authenticity_token'],
            "login": "xxx",  # replace with your own user name
            "password": "xxx",  # replace with your own password
            "webauthn-support": "supported",
            "webauthn-iuvpaa-support": "unsupported",
            "return_to": "https://github.com/login",
            "timestamp": hidden['timestamp'],
            "timestamp_secret": hidden['timestamp_secret'],
        }

        # FormRequest sends the data as a POST request.
        yield scrapy.FormRequest(
            url='https://github.com/session',  # URL the login form posts to
            callback=self.after_login,
            formdata=data,
        )

    def after_login(self, response):
        """Request a profile page once the login POST has been answered."""
        yield scrapy.Request('https://github.com/yezhoubing', callback=self.check_login)

    def check_login(self, response):
        """Print the title of the fetched page so the result can be inspected."""
        print(response.xpath('html/head/title/text()').extract_first())

展开全文 >>

scrapy使用selenium

2022-06-15

字数统计: 124字 | 阅读时长: 1min

在爬虫文件中

def __init__(self):
    self.bro = Chrome()  # 默认路径为python.exe所在文件夹

展开全文 >>

scrapy异步下载图片

2022-06-15

字数统计: 318字 | 阅读时长: 1min

scrapy异步下载图片

通过 `from scrapy.pipelines.images import ImagesPipeline` 导入的图片管道进行下载；也可以继承该管道并重写其方法，从而修改默认的下载方式

爬虫文件

"""scrapy异步下载图片"""

import scrapy
from selenium.webdriver import Chrome
from ..items import ServantPicItem
import pandas as pd
import numpy as np
from scrapy.pipelines.images import ImagesPipeline
class ServantSpider(scrapy.Spider):
    """Yield a single item whose image URLs come from a local ``link.csv``.

    The start URL only triggers ``parse``; the response itself is not read.
    """

    # Values of the "图片名称" column; replaced on the instance by parse().
    name_list = []
    # Values of the "图片链接" column; replaced on the instance by parse().
    img_list = []
    name = 'servant'
    start_urls = ['https://fgo.wiki/w/%E8%8B%B1%E7%81%B5%E5%9B%BE%E9%89%B4']

    def parse(self, response):
        """Load names and links from ``link.csv`` and yield one item.

        The file is read once for both columns. Empty cells are dropped per
        column, so the two lists may differ in length.
        """
        # Read the local CSV file a single time instead of once per column.
        table = pd.read_csv("link.csv", usecols=["图片名称", "图片链接"])
        self.name_list = table["图片名称"].dropna().tolist()
        self.img_list = table["图片链接"].dropna().tolist()

        item = ServantPicItem()
        # The names are loaded but intentionally not attached to the item.
        # The key must be 'image_urls': that is the field the image download
        # pipeline reads its URLs from by default.
        item['image_urls'] = self.img_list
        yield item