$ scrapy shell http://quotes.toscrape.com/random
>>> response.css('small.author::text').extract_first()
>>> response.css('span.text::text').extract_first()
>>> response.css('a.tag::text').extract()
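The same elements can be selected with XPath instead of CSS; equivalent expressions to try in the same shell session (the class names are assumed from the page markup queried above):

>>> response.xpath('//small[@class="author"]/text()').extract_first()
>>> response.xpath('//span[@class="text"]/text()').extract_first()
>>> response.xpath('//a[@class="tag"]/text()').extract()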
# -*- coding: utf-8 -*-
import scrapy


class SingleQuoteSpider(scrapy.Spider):
    name = "single-quote"
    allowed_domains = ["toscrape.com"]
    start_urls = ['http://quotes.toscrape.com/random']

    def parse(self, response):
        self.log('I just visited: ' + response.url)
        # each quote on the page sits in its own div.quote container
        for quote in response.css('div.quote'):
            item = {
                'author_name': quote.css('small.author::text').extract_first(),
                'text': quote.css('span.text::text').extract_first(),
                'tags': quote.css('a.tag::text').extract(),
            }
            yield item
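A single-file spider like this can be run without creating a full Scrapy project; a minimal invocation, assuming the code above is saved as single_quote.py (the filename here is arbitrary):

$ scrapy runspider single_quote.py -o quote.json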
# -*- coding: utf-8 -*-
import scrapy


class MultipleQuotesSpider(scrapy.Spider):
    name = "multiple-quotes"
    allowed_domains = ["toscrape.com"]
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        self.log('I just visited: ' + response.url)
        for quote in response.css('div.quote'):
            item = {
                'author_name': quote.css('small.author::text').extract_first(),
                'text': quote.css('span.text::text').extract_first(),
                'tags': quote.css('a.tag::text').extract(),
            }
            yield item
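Inside a project generated with scrapy startproject, the spider would instead live in the project's spiders/ directory and be run by its name attribute; the feed export format follows from the output file extension:

$ scrapy crawl multiple-quotes -o quotes.csv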
# -*- coding: utf-8 -*-
import scrapy


class MultipleQuotesPaginationSpider(scrapy.Spider):
    name = "multiple-quotes-pagination"
    allowed_domains = ["toscrape.com"]
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        self.log('I just visited: ' + response.url)
        for quote in response.css('div.quote'):
            item = {
                'author_name': quote.css('small.author::text').extract_first(),
                'text': quote.css('span.text::text').extract_first(),
                'tags': quote.css('a.tag::text').extract(),
            }
            yield item
        # follow pagination link
        next_page_url = response.css('li.next > a::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
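On Scrapy 1.4 or later, response.follow accepts relative URLs directly, so the urljoin step can be dropped; an equivalent ending for parse, as a sketch:

        # follow pagination link (response.follow resolves the relative URL itself)
        next_page_url = response.css('li.next > a::attr(href)').extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)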
# -*- coding: utf-8 -*-
import scrapy


class AuthorsSpider(scrapy.Spider):
    name = "authors"
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        urls = response.css('div.quote > span > a::attr(href)').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)
        # follow pagination link
        next_page_url = response.css('li.next > a::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'name': response.css('h3.author-title::text').extract_first(),
            'birth_date': response.css('span.author-born-date::text').extract_first(),
        }
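Scrapy's built-in duplicate request filter ensures each author page is fetched only once, even though prolific authors are linked from many listing pages. To run the spider and write JSON Lines output, assuming the file is saved as authors.py:

$ scrapy runspider authors.py -o authors.jl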
# -*- coding: utf-8 -*-
import json

import scrapy


class QuotesInfiniteScrollSpider(scrapy.Spider):
    name = "quotes-infinite-scroll"
    api_url = 'http://quotes.toscrape.com/api/quotes?page={}'
    start_urls = [api_url.format(1)]

    def parse(self, response):
        # the endpoint returns JSON, so there is no HTML to parse
        data = json.loads(response.text)
        for quote in data['quotes']:
            yield {
                'author_name': quote['author']['name'],
                'text': quote['text'],
                'tags': quote['tags'],
            }
        # the API reports whether another page of results exists
        if data['has_next']:
            next_page = data['page'] + 1
            yield scrapy.Request(url=self.api_url.format(next_page), callback=self.parse)
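The JSON endpoint can be inspected first to confirm the keys the spider relies on; a quick check in the Scrapy shell (the URL is quoted so the shell doesn't swallow the query string):

$ scrapy shell 'http://quotes.toscrape.com/api/quotes?page=1'
>>> import json
>>> data = json.loads(response.text)
>>> data['has_next'], data['page'], len(data['quotes'])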
# -*- coding: utf-8 -*-
import scrapy


class QuotesLoginSpider(scrapy.Spider):
    name = 'quotes-login'
    login_url = 'http://quotes.toscrape.com/login'
    start_urls = [login_url]

    def parse(self, response):
        # extract the csrf token value
        token = response.css('input[name="csrf_token"]::attr(value)').extract_first()
        # create a python dictionary with the form values
        data = {
            'csrf_token': token,
            'username': 'abc',
            'password': 'abc',
        }
        # submit a POST request to it
        yield scrapy.FormRequest(url=self.login_url, formdata=data, callback=self.parse_quotes)

    def parse_quotes(self, response):
        """Parse the main page after the spider is logged in"""
        for q in response.css('div.quote'):
            yield {
                'author_name': q.css('small.author::text').extract_first(),
                'author_url': q.css(
                    'small.author ~ a[href*="goodreads.com"]::attr(href)'
                ).extract_first(),
            }
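scrapy.FormRequest.from_response can handle the token automatically: it pre-populates every field it finds in the page's form, including the hidden csrf_token, so only the credentials need to be supplied. A sketch of parse rewritten that way:

    def parse(self, response):
        # from_response reads the login form and fills in csrf_token for us
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'abc', 'password': 'abc'},
            callback=self.parse_quotes,
        )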
import scrapy
from scrapy_splash import SplashRequest


class QuotesJSSpider(scrapy.Spider):
    name = 'quotesjs'
    # all these settings can be put in your project's settings.py file
    custom_settings = {
        'SPLASH_URL': 'http://localhost:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    }

    def start_requests(self):
        yield SplashRequest(
            url='http://quotes.toscrape.com/js',
            callback=self.parse,
        )

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                'text': quote.css("span.text::text").extract_first(),
                'author': quote.css("small.author::text").extract_first(),
                'tags': quote.css("div.tags > a.tag::text").extract(),
            }
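The SPLASH_URL setting above assumes a Splash instance listening on localhost:8050; the usual way to get one running, alongside installing the scrapy-splash package, is the official Docker image:

$ pip install scrapy-splash
$ docker run -p 8050:8050 scrapinghub/splash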
$ pip install shub
$ shub login
$ shub deploy
$ shub schedule single-quote  # pass the spider's "name" attribute