Examples
"You don't learn to walk by following rules. You learn by doing, and by falling over." — Richard Branson
This page provides practical examples demonstrating different qCrawl features and common use cases.
CSS selectors with custom settings
This example demonstrates CSS selectors, custom spider settings, and error handling patterns.
import logging

from cssselect import SelectorError

from qcrawl.core.item import Item
from qcrawl.core.spider import Spider

logger = logging.getLogger(__name__)


class Quotes(Spider):
    name = "quotes_css"
    start_urls = ["https://quotes.toscrape.com/"]

    custom_settings = {
        "CONCURRENCY": 10,
        "CONCURRENCY_PER_DOMAIN": 10,
        "DELAY_PER_DOMAIN": 0.25,
        "MAX_DEPTH": 0,  # unlimited depth
        "TIMEOUT": 30.0,
        "MAX_RETRIES": 3,
        "USER_AGENT": "qCrawl-Examples/1.0",
        "REQUIRED_FIELDS": ["text", "author"],
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html",
            "User-Agent": "qCrawl-Examples/1.0",
        },
        "PIPELINES": {
            "qcrawl.pipelines.validation.ValidationPipeline": 200,
        },
    }

    async def parse(self, response):
        rv = self.response_view(response)

        # Safely handle CSS selector errors (malformed or unsupported selectors)
        try:
            quotes = rv.doc.cssselect("div.quote")
        except SelectorError:
            # Skip the page gracefully without raising an error
            return

        for q in quotes:
            try:
                text = q.cssselect("span.text")[0].text_content().strip()
                author = q.cssselect("small.author")[0].text_content().strip()
                tags = [t.text_content().strip() for t in q.cssselect("div.tags a.tag")]
            except (IndexError, SelectorError):
                # Skip items with missing elements or selector issues
                continue

            yield Item(
                data={"url": response.url, "text": text, "author": author, "tags": tags},
                metadata={"source": "quotes.toscrape.com"},
            )

        # Follow pagination links (rv.follow resolves relative URLs automatically)
        next_a = rv.doc.cssselect("li.next a")
        if next_a:
            href = next_a[0].get("href")
            if href:
                yield rv.follow(href)
For details on configuring settings, including custom_settings, see the Settings documentation.
XPath selectors
This example shows how to use XPath selectors for more powerful element selection and text extraction.
import logging

from qcrawl.core.item import Item
from qcrawl.core.spider import Spider

logger = logging.getLogger(__name__)


class QuotesXPath(Spider):
    name = "quotes_xpath"
    start_urls = ["https://quotes.toscrape.com/"]

    custom_settings = {
        "CONCURRENCY": 10,
        "USER_AGENT": "qCrawl-XPath-Example/1.0",
    }

    async def parse(self, response):
        rv = self.response_view(response)

        # Use XPath selectors instead of CSS
        quotes = rv.doc.xpath("//div[@class='quote']")

        for q in quotes:
            # XPath can extract text nodes directly
            text_nodes = q.xpath(".//span[@class='text']/text()")
            author_nodes = q.xpath(".//small[@class='author']/text()")
            tag_nodes = q.xpath(".//div[@class='tags']/a[@class='tag']/text()")

            # Skip quotes that are missing required fields
            if not text_nodes or not author_nodes:
                continue

            text = text_nodes[0].strip()
            author = author_nodes[0].strip()
            tags = [t.strip() for t in tag_nodes]

            yield Item(
                data={"url": response.url, "text": text, "author": author, "tags": tags},
                metadata={"selector_type": "xpath"},
            )

        # Follow pagination using XPath
        next_link = rv.doc.xpath("//li[@class='next']/a/@href")
        if next_link:
            yield rv.follow(next_link[0])
XPath vs CSS selectors:
- XPath (//div[@class='quote']): more powerful; can navigate parent and sibling nodes.
- CSS (div.quote): simpler syntax; better supported in browser DevTools.
See the Selectors documentation for more details.
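For a concrete side-by-side comparison, here is a minimal sketch using lxml.html directly. It assumes rv.doc in the spiders above behaves like an lxml HTML element (it exposes .cssselect() and .xpath(), as the examples show) and that the cssselect package is installed; the sample HTML and variable names are purely illustrative.

from lxml import html

page = html.fromstring(
    "<html><body>"
    '<div class="quote">'
    '<span class="text">An example quote</span>'
    '<small class="author">An Author</small>'
    "</div>"
    "</body></html>"
)

# CSS: concise syntax, and the same selector works in browser DevTools
css_text = page.cssselect("div.quote span.text")[0].text_content()

# XPath: equivalent selection, plus direct extraction of text nodes
xpath_text = page.xpath("//div[@class='quote']/span[@class='text']/text()")[0]

assert css_text == xpath_text == "An example quote"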
API and JSON scraping
This example demonstrates scraping JSON APIs, handling content-type validation, and parsing Link headers for pagination.
import logging

from qcrawl.core.item import Item
from qcrawl.core.request import Request
from qcrawl.core.spider import Spider

logger = logging.getLogger(__name__)


class JSONApiSpider(Spider):
    name = "json_api"
    start_urls = ["https://api.github.com/users/github/repos?page=1&per_page=10"]

    custom_settings = {
        "CONCURRENCY": 5,
        "USER_AGENT": "qCrawl-API-Example/1.0",
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "application/json",
        },
    }

    async def parse(self, response):
        # Check if the response is JSON
        if not response.headers.get("content-type", "").startswith("application/json"):
            logger.warning(f"Expected JSON but got {response.headers.get('content-type')}")
            return

        # Parse the JSON response
        try:
            repos = response.json()
        except Exception as e:
            logger.error(f"Failed to parse JSON from {response.url}: {e}")
            return

        # Extract data from JSON
        for repo in repos:
            yield Item(
                data={
                    "name": repo.get("name"),
                    "full_name": repo.get("full_name"),
                    "description": repo.get("description"),
                    "stars": repo.get("stargazers_count", 0),
                    "forks": repo.get("forks_count", 0),
                    "language": repo.get("language"),
                    "url": repo.get("html_url"),
                },
                metadata={"api_source": "github"},
            )
        # Follow pagination in the API (if a Link header exists)
        # Format: <https://api.github.com/...?page=2>; rel="next", <...>; rel="last"
        link_header = response.headers.get("Link", "")
        for entry in link_header.split(","):
            # Only follow the entry explicitly marked as the next page
            if 'rel="next"' in entry:
                next_url = entry.split(";")[0].strip().strip("<>")
                yield Request(url=next_url)
                break
        # Alternative: pagination via query params
        # current_page = int(response.url.split("page=")[1].split("&")[0])
        # if len(repos) > 0:  # Has more results
        #     next_page = current_page + 1
        #     yield Request(url=f"https://api.github.com/users/github/repos?page={next_page}&per_page=10")
Key points for API scraping:
- Use response.json() to parse JSON responses.
- Check the Content-Type header to verify the response is actually JSON.
- Handle pagination via Link headers or query parameters (a small header-parsing helper is sketched after this list).
- Set an appropriate Accept header in DEFAULT_REQUEST_HEADERS.
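The spider above only needs the next page, but a Link header can also advertise prev, first, and last links. If you want all of them, a small parser that maps each rel value to its URL keeps the pagination logic in one place. A minimal sketch; parse_link_header is an illustrative helper, not part of qCrawl:

def parse_link_header(link_header):
    """Map each rel value in a Link header to its URL.

    Example input: '<https://api.example.com/?page=2>; rel="next", '
                   '<https://api.example.com/?page=5>; rel="last"'
    """
    links = {}
    for entry in link_header.split(","):
        if ";" not in entry:
            continue
        url_part, _, params = entry.partition(";")
        url = url_part.strip().strip("<>")
        for param in params.split(";"):
            name, _, value = param.strip().partition("=")
            if name == "rel":
                links[value.strip('"')] = url
    return links

# Usage inside parse():
# links = parse_link_header(response.headers.get("Link", ""))
# if "next" in links:
#     yield Request(url=links["next"])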
Running spiders programmatically (without CLI)
Use SpiderRunner to run any spider programmatically from Python code:
import asyncio

from qcrawl.runner.run import SpiderRunner


async def main():
    runner = SpiderRunner(
        settings={
            "CONCURRENCY": 4,
            "LOG_LEVEL": "INFO",
        }
    )
    # Run any spider class, e.g. the Quotes spider from the CSS example above
    # (import it or define it in the same module)
    await runner.crawl(Quotes)

if __name__ == "__main__":
    asyncio.run(main())
This is useful for embedding qCrawl in applications, scripts, or automated workflows.
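For example, a long-running script could schedule recurring crawls with plain asyncio. A minimal sketch, assuming the Quotes spider from the first example is importable; a fresh SpiderRunner is created per run, and interval_seconds is an illustrative parameter, not a qCrawl setting:

import asyncio

from qcrawl.runner.run import SpiderRunner


async def crawl_forever(interval_seconds=3600.0):
    # Re-crawl on a fixed interval; creating a new runner per run avoids
    # assuming anything about whether a SpiderRunner can be reused across crawls.
    while True:
        runner = SpiderRunner(settings={"CONCURRENCY": 4, "LOG_LEVEL": "INFO"})
        await runner.crawl(Quotes)  # the Quotes spider class defined above
        await asyncio.sleep(interval_seconds)


if __name__ == "__main__":
    asyncio.run(crawl_forever())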
See "Advanced Topics" section for more complex patterns like collecting items programmatically, running multiple spiders, or using pipelines and signals.