Data extraction
Extraction vs Pipeline Processing
Best practice: Separate extraction from transformation.
In the Spider (extraction):
- Extract raw or lightly cleaned data
- Use basic .strip() for whitespace
- Check element existence
- Provide fallbacks for optional fields
In Item Pipelines (transformation):
- Complex text cleaning (regex, normalization)
- Price/date parsing and formatting
- Type conversions and validation
- Business logic and data enrichment
Example pattern:
# Spider: Extract raw data with minimal cleaning
async def parse(self, response):
    rv = self.response_view(response)
    for product in rv.doc.cssselect(".product"):
        title = product.cssselect("h2")
        price = product.cssselect(".price")
        yield {
            "title": title[0].text_content().strip() if title else None,
            "price": price[0].text_content().strip() if price else None,  # Raw: "$1,234.56"
            "url": response.url,
        }

# Pipeline: Clean and transform (in pipelines, not spider)
class DataCleaningPipeline(ItemPipeline):
    async def process_item(self, item, spider):
        # Transform "$1,234.56" → 1234.56
        if "price" in item.data:
            item.data["price"] = self.parse_price(item.data["price"])
        return item
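parse_price above is the pipeline's own helper, not a framework function. A minimal sketch of one possible implementation, written as a method on DataCleaningPipeline and assuming prices arrive as strings like "$1,234.56":

import re

def parse_price(self, raw):
    """Hypothetical helper: "$1,234.56" -> 1234.56, or None if unparseable."""
    # Drop currency symbols and thousands separators; keep digits, dot, and sign
    cleaned = re.sub(r"[^\d.\-]", "", raw or "")
    try:
        return float(cleaned)
    except ValueError:
        return None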
For data cleaning, transformation, and validation examples, see Item Pipeline.
Structured data extraction
Extract with Item class (recommended)
from qcrawl.core.item import Item

async def parse(self, response):
    rv = self.response_view(response)
    for product in rv.doc.cssselect(".product"):
        # Extract with fallbacks
        title = product.cssselect("h2.title")
        price = product.cssselect(".price")
        rating = product.cssselect(".rating")
        # Guard against a missing data-rating attribute before converting
        rating_value = rating[0].get("data-rating") if rating else None
        yield Item(data={
            "title": title[0].text_content().strip() if title else None,
            "price": price[0].text_content().strip() if price else None,
            "rating": float(rating_value) if rating_value else None,
            "url": response.url,
        })
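Wrapping records in Item is what lets pipelines process them uniformly. A minimal sketch of a pipeline consuming these items, using the same ItemPipeline/process_item shape shown earlier (the missing-title check is an arbitrary example rule, not framework behavior):

import logging

logger = logging.getLogger(__name__)

class ProductValidationPipeline(ItemPipeline):
    async def process_item(self, item, spider):
        # Example rule: warn on records missing a title rather than guessing one
        if not item.data.get("title"):
            logger.warning("Product without title: %s", item.data.get("url"))
        return item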
Extract with plain dict
async def parse(self, response):
    rv = self.response_view(response)
    for article in rv.doc.cssselect("article"):
        title_elem = article.cssselect("h2")
        author_elem = article.cssselect(".author")
        date_elem = article.cssselect("time")
        yield {
            "title": title_elem[0].text_content().strip() if title_elem else None,
            "author": author_elem[0].text_content().strip() if author_elem else None,
            "published": date_elem[0].get("datetime") if date_elem else None,
            "url": response.url,
        }
Nested data extraction
Extract nested attributes
async def parse(self, response):
    rv = self.response_view(response)
    for product in rv.doc.cssselect(".product"):
        # Extract nested key-value pairs
        attributes = {}
        for attr in product.cssselect(".attribute"):
            key_elem = attr.cssselect(".key")
            value_elem = attr.cssselect(".value")
            if key_elem and value_elem:
                key = key_elem[0].text_content().strip()
                value = value_elem[0].text_content().strip()
                attributes[key] = value
        # Extract list of images
        images = [
            img.get("src")
            for img in product.cssselect("img.gallery")
            if img.get("src")
        ]
        # Check the title element exists before indexing into it
        title_elem = product.cssselect("h2")
        yield {
            "title": title_elem[0].text_content().strip() if title_elem else None,
            "attributes": attributes,
            "images": images,
        }
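For one product, the yielded record might look like this (values are hypothetical):

{
    "title": "Example Widget",
    "attributes": {"Color": "Red", "Weight": "2 kg"},
    "images": ["/img/widget-front.jpg", "/img/widget-back.jpg"],
}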
Extract hierarchical data
async def parse(self, response):
    rv = self.response_view(response)
    for category in rv.doc.cssselect(".category"):
        name_elem = category.cssselect("h2")
        if not name_elem:
            continue  # Skip malformed categories
        category_name = name_elem[0].text_content().strip()
        # Extract nested subcategories
        subcategories = []
        for subcat in category.cssselect(".subcategory"):
            subcat_elem = subcat.cssselect("h3")
            if not subcat_elem:
                continue
            subcat_name = subcat_elem[0].text_content().strip()
            # Extract items within subcategory
            items = [
                item.text_content().strip()
                for item in subcat.cssselect("li")
            ]
            subcategories.append({
                "name": subcat_name,
                "items": items,
            })
        yield {
            "category": category_name,
            "subcategories": subcategories,
        }
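A category with one subcategory would yield (hypothetical values):

{
    "category": "Electronics",
    "subcategories": [
        {"name": "Phones", "items": ["Model A", "Model B"]},
    ],
}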
API and JSON data
Parse JSON responses
from qcrawl.core.request import Request

async def parse(self, response):
    # Parse JSON response
    data = response.json()
    # Extract items
    for item in data.get("results", []):
        yield {
            "id": item.get("id"),
            "name": item.get("name"),
            "price": item.get("price"),
            "stock": item.get("stock_count", 0),
        }
    # Handle pagination
    next_page = data.get("next_page")
    if next_page:
        yield Request(url=next_page)
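Not every API returns a ready-made next_page URL. A sketch of page-number pagination, assuming the response reports page and total_pages fields (both hypothetical, as is the query parameter name):

async def parse(self, response):
    data = response.json()
    for item in data.get("results", []):
        yield item
    # Hypothetical schema: the API reports its own position in the result set
    page = data.get("page", 1)
    total_pages = data.get("total_pages", 1)
    if page < total_pages:
        yield Request(url=f"https://api.example.com/items?page={page + 1}")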
Parse JSON within HTML
import json

async def parse(self, response):
    rv = self.response_view(response)
    # Extract JSON from script tags
    script_tags = rv.doc.cssselect("script[type='application/ld+json']")
    for script in script_tags:
        try:
            data = json.loads(script.text_content())
            if data.get("@type") == "Product":
                yield {
                    "name": data.get("name"),
                    "price": data.get("offers", {}).get("price"),
                    "currency": data.get("offers", {}).get("priceCurrency"),
                    "rating": data.get("aggregateRating", {}).get("ratingValue"),
                }
        except (json.JSONDecodeError, AttributeError):
            continue
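Some pages put a list of objects, or an @graph array, inside a single ld+json script. A small normalizer that feeds the same @type check either way (the helper name is ours, not part of qcrawl):

def ld_objects(payload):
    """Return the candidate JSON-LD objects inside a decoded script payload."""
    if isinstance(payload, list):      # [obj, obj, ...]
        return payload
    if isinstance(payload, dict):      # single object, or {"@graph": [...]}
        return payload.get("@graph", [payload])
    return []

# In the loop above:
#     for data in ld_objects(json.loads(script.text_content())):
#         if isinstance(data, dict) and data.get("@type") == "Product":
#             ...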
Handle GraphQL APIs
from qcrawl.core.request import Request

async def start_requests(self):
    query = """
    query GetProducts($page: Int!) {
        products(page: $page) {
            id
            name
            price
        }
    }
    """
    yield Request(
        url="https://api.example.com/graphql",
        method="POST",
        body={
            "query": query,
            "variables": {"page": 1},
        },
        headers={"Content-Type": "application/json"},
    )
async def parse(self, response):
    data = response.json()
    products = data.get("data", {}).get("products", [])
    for product in products:
        yield {
            "id": product["id"],
            "name": product["name"],
            "price": product["price"],
        }
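The query above takes a $page variable, but the parse shown never advances it. A sketch of requesting subsequent pages, assuming the query string is kept as a module-level QUERY constant and that an empty page signals the end (both assumptions about this hypothetical API):

async def parse(self, response):
    data = response.json()
    products = data.get("data", {}).get("products", [])
    for product in products:
        yield product
    # Stop when a page comes back empty; otherwise request the next one.
    # self.page is spider-local state (assumes a single pagination chain).
    if products:
        self.page = getattr(self, "page", 1) + 1
        yield Request(
            url="https://api.example.com/graphql",
            method="POST",
            body={"query": QUERY, "variables": {"page": self.page}},  # QUERY: query string from start_requests
            headers={"Content-Type": "application/json"},
        )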
Sitemap crawling
Parse XML sitemaps
from qcrawl.core.spider import Spider
from qcrawl.core.request import Request
import xml.etree.ElementTree as ET

class SitemapSpider(Spider):
    name = "sitemap_spider"
    start_urls = ["https://example.com/sitemap.xml"]

    async def parse(self, response):
        # Parse sitemap XML
        root = ET.fromstring(response.text)
        namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        # Extract URLs
        for url_elem in root.findall(".//ns:url/ns:loc", namespace):
            page_url = url_elem.text
            # Optionally filter URLs
            if "/products/" in page_url:
                yield Request(url=page_url)
        # Handle sitemap index (sitemap of sitemaps)
        for sitemap in root.findall(".//ns:sitemap/ns:loc", namespace):
            yield Request(url=sitemap.text)
Extract sitemap metadata
async def parse(self, response):
    root = ET.fromstring(response.text)
    namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    for url_elem in root.findall(".//ns:url", namespace):
        loc = url_elem.find("ns:loc", namespace)
        lastmod = url_elem.find("ns:lastmod", namespace)
        priority = url_elem.find("ns:priority", namespace)
        if loc is not None:
            yield Request(
                url=loc.text,
                meta={
                    "lastmod": lastmod.text if lastmod is not None else None,
                    "priority": priority.text if priority is not None else None,
                },
            )
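The lastmod value can also drive incremental crawls. A minimal sketch that skips pages unchanged since a previous run, assuming lastmod starts with a YYYY-MM-DD date (the LAST_CRAWL cutoff is hypothetical and would normally be persisted between runs):

from datetime import date

LAST_CRAWL = date(2024, 1, 1)  # Hypothetical cutoff from the previous run

async def parse(self, response):
    root = ET.fromstring(response.text)
    namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    for url_elem in root.findall(".//ns:url", namespace):
        loc = url_elem.find("ns:loc", namespace)
        lastmod = url_elem.find("ns:lastmod", namespace)
        if loc is None:
            continue
        # Skip unchanged pages; crawl anything without a lastmod to be safe
        if lastmod is not None and date.fromisoformat(lastmod.text[:10]) <= LAST_CRAWL:
            continue
        yield Request(url=loc.text)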
Best practices
- Separate extraction from transformation: Extract raw data in spiders, clean/transform/validate in pipelines
- Use Item for structured data: Wrap data in the Item class for pipeline processing
- Extract data defensively: Always check if elements exist before accessing them
- Document data schema: Comment expected data structure and field types
- Store source URLs: Include the source URL in extracted data for debugging (or carry it in Item metadata)
See also: Item Pipeline, Items