Pagination

Pagination is how websites split content across multiple pages. Sites use different mechanisms: "next" links, numbered pages, "load more" buttons, or infinite scroll. Identifying which mechanism a site uses is the first step toward extracting all of its data.

Simple next-page pagination

Follow a "next" or "→" link to the next page:

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items from current page
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Follow "next" link
    next_link = rv.doc.cssselect("a.next")
    if next_link:
        href = next_link[0].get("href")
        if href:
            yield rv.follow(href)
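
The extracted href may be relative; these examples assume rv.follow resolves a relative href against the current response URL before scheduling the request.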

Alternative selectors for "next" links:

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Try multiple next-link selectors
    next_selectors = [
        "a.next",
        "a[rel='next']",
        "a:contains('Next')",
        "li.next > a",
        "a.pagination-next",
    ]

    for selector in next_selectors:
        next_links = rv.doc.cssselect(selector)
        if next_links:
            href = next_links[0].get("href")
            if href:
                yield rv.follow(href)
                break
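
Note that :contains('Next') is not standard CSS; lxml's cssselect supports it as an extension (translated to an XPath text match), so this selector may not work in other selector engines.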

Numbered pagination

Generate page URLs based on page numbers:

from qcrawl.core.request import Request

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Generate page URLs
    current_page = response.request.meta.get("page", 1)
    max_pages = 10

    if current_page < max_pages:
        next_page = current_page + 1
        next_url = f"https://example.com/items?page={next_page}"
        yield Request(url=next_url, meta={"page": next_page})
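
If the page count is fixed and known up front, you can also fan out all page requests from the first response instead of chaining them one at a time, letting the scheduler fetch pages concurrently. A minimal sketch under the same assumptions as above:

from qcrawl.core.request import Request

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items from the current page
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Fan out the remaining pages once, from page 1 only
    if response.request.meta.get("page", 1) == 1:
        for page in range(2, 11):  # pages 2..10, matching max_pages above
            yield Request(
                url=f"https://example.com/items?page={page}",
                meta={"page": page},
            )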

Dynamic max_pages detection:

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Detect total pages from pagination
    pagination_links = rv.doc.cssselect(".pagination a")
    if pagination_links:
        # Extract page numbers from links
        page_numbers = []
        for link in pagination_links:
            text = link.text_content().strip()
            if text.isdigit():
                page_numbers.append(int(text))

        max_pages = max(page_numbers) if page_numbers else 1
    else:
        max_pages = 1

    current_page = response.request.meta.get("page", 1)

    if current_page < max_pages:
        next_page = current_page + 1
        next_url = f"https://example.com/items?page={next_page}"
        yield Request(url=next_url, meta={"page": next_page})
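
This works when the pagination widget renders the last page number. Widgets truncated to nearby pages only (e.g. "1 2 3 ...") will under-count, in which case chaining through "next" links is the safer approach.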

Load-more / infinite scroll

"Load more" buttons and infinite scroll usually fetch additional items from an AJAX endpoint behind the scenes; request that endpoint directly instead of simulating clicks or scrolling:

from qcrawl.core.request import Request

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)

    # Paginate through the AJAX endpoint
    page = response.request.meta.get("page", 1)
    max_pages = 20

    if page < max_pages:
        # API endpoint that returns more items
        api_url = f"https://example.com/api/items?offset={page * 20}"
        yield Request(
            url=api_url,
            meta={"page": page + 1}
        )
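
The exact endpoint and page size are site-specific; you can usually discover them by watching the browser's network tab while clicking the "load more" button or scrolling.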

Handle JSON responses:

from qcrawl.core.request import Request

async def parse(self, response):
    # Check if response is JSON
    if response.headers.get("Content-Type", "").startswith("application/json"):
        data = response.json()

        # Extract items from JSON
        for item in data.get("items", []):
            yield {
                "title": item.get("title"),
                "price": item.get("price"),
            }

        # Check for next page
        if data.get("has_more"):
            page = response.request.meta.get("page", 1)
            next_url = f"https://example.com/api/items?offset={page * 20}"
            yield Request(url=next_url, meta={"page": page + 1})
    else:
        # Handle HTML response
        rv = self.response_view(response)
        for item in rv.doc.cssselect(".item"):
            yield self.extract_item(item)

Cursor-based pagination

Handle cursor/token-based pagination (common in APIs):

from qcrawl.core.request import Request

async def parse(self, response):
    data = response.json()

    # Extract items
    for item in data.get("results", []):
        yield {
            "id": item["id"],
            "name": item["name"],
        }

    # Follow next cursor
    next_cursor = data.get("next_cursor")
    if next_cursor:
        next_url = f"https://api.example.com/items?cursor={next_cursor}"
        yield Request(url=next_url)

Offset-based pagination

Offset/limit pagination is common in REST APIs:

from qcrawl.core.request import Request

async def parse(self, response):
    data = response.json()

    # Extract items
    items = data.get("items", [])
    for item in items:
        yield item

    # Calculate next offset
    current_offset = response.request.meta.get("offset", 0)
    limit = 20
    total = data.get("total_count")

    if total and current_offset + limit < total:
        next_offset = current_offset + limit
        next_url = f"https://api.example.com/items?offset={next_offset}&limit={limit}"
        yield Request(
            url=next_url,
            meta={"offset": next_offset}
        )
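
Some APIs omit total_count. A common fallback, sketched below against the same assumed endpoint, is to stop as soon as a page returns fewer items than the requested limit:

from qcrawl.core.request import Request

async def parse(self, response):
    data = response.json()

    items = data.get("items", [])
    for item in items:
        yield item

    # A short (or empty) page signals the end of the collection
    limit = 20
    if len(items) == limit:
        next_offset = response.request.meta.get("offset", 0) + limit
        next_url = f"https://api.example.com/items?offset={next_offset}&limit={limit}"
        yield Request(url=next_url, meta={"offset": next_offset})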

Pagination with state tracking

Track pagination state across requests:

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    items_found = 0
    for item in rv.doc.cssselect(".item"):
        yield self.extract_item(item)
        items_found += 1

    # Track cumulative stats
    total_items = response.request.meta.get("total_items", 0) + items_found
    current_page = response.request.meta.get("page", 1)

    # Follow next page
    next_link = rv.doc.cssselect("a.next")
    if next_link and items_found > 0:
        href = next_link[0].get("href")
        if href:
            yield rv.follow(
                href,
                meta={
                    "page": current_page + 1,
                    "total_items": total_items
                }
            )
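
When no further "next" link is found, the accumulated total_items count can be logged or emitted as a summary record.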

Handling pagination errors

Gracefully handle pagination edge cases:

async def parse(self, response):
    rv = self.response_view(response)

    # Extract items
    items = rv.doc.cssselect(".item")

    # Check if page has content
    if not items:
        self.logger.warning(f"No items found on page: {response.url}")
        return

    for item in items:
        yield self.extract_item(item)

    # Safety limit to prevent infinite loops
    current_page = response.request.meta.get("page", 1)
    max_pages = 100

    if current_page >= max_pages:
        self.logger.warning(f"Reached max pages limit: {max_pages}")
        return

    # Follow next link with error handling
    next_link = rv.doc.cssselect("a.next")
    if next_link:
        href = next_link[0].get("href")
        if href:
            yield rv.follow(href, meta={"page": current_page + 1})

Best practices

  • Handle pagination limits: Set reasonable max_pages to prevent infinite loops
  • Track page numbers: Use meta to pass page state through requests
  • Detect pagination type: Identify whether site uses links, numbers, or AJAX
  • Validate pagination: Check for empty pages or duplicate content (see the sketch after this list)
  • Log pagination progress: Track pages processed for debugging
  • Handle edge cases: Account for single-page results, broken pagination
  • Respect rate limits: Don't hammer pagination endpoints too quickly
  • Test thoroughly: Verify pagination works from first to last page
  • Stop on empty pages: Exit when no more items are found
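
A minimal sketch of the duplicate-content check mentioned above, assuming items expose a stable identifier (the data-id attribute here is hypothetical) and that spider instance attributes persist across parse calls:

async def parse(self, response):
    rv = self.response_view(response)
    items = rv.doc.cssselect(".item")

    # Fingerprint the page by its first item's id to catch loops
    # where different page URLs serve the same content
    first_id = items[0].get("data-id") if items else None
    seen = getattr(self, "_seen_first_ids", set())
    self._seen_first_ids = seen

    if first_id is not None:
        if first_id in seen:
            self.logger.warning(f"Duplicate page content at {response.url}, stopping")
            return
        seen.add(first_id)

    for item in items:
        yield self.extract_item(item)

    next_link = rv.doc.cssselect("a.next")
    if next_link and next_link[0].get("href"):
        yield rv.follow(next_link[0].get("href"))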

See also: Crawl Ordering, Link Filtering