Link filtering

Link filtering determines which URLs your crawler will follow. Effective filtering reduces unnecessary requests, focuses on target content, and respects domain boundaries. Filters can be based on URL patterns, domains, link attributes, or custom logic.

Filter by URL pattern

Use regular expressions to match specific URL structures:

import re

async def parse(self, response):
    rv = self.response_view(response)

    # Only follow product URLs
    product_pattern = re.compile(r"/products?/[\w-]+")

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)
            if product_pattern.search(full_url):
                yield rv.follow(href)

Combine allow and exclude patterns, checking exclusions first:

import re

async def parse(self, response):
    rv = self.response_view(response)

    # Define multiple patterns
    allowed_patterns = [
        re.compile(r"/products?/[\w-]+"),
        re.compile(r"/categories/[\w-]+"),
        re.compile(r"/search\?q="),
    ]

    # Exclude patterns
    excluded_patterns = [
        re.compile(r"/login"),
        re.compile(r"/logout"),
        re.compile(r"/admin"),
    ]

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)

            # Check excluded first
            if any(pattern.search(full_url) for pattern in excluded_patterns):
                continue

            # Then check allowed
            if any(pattern.search(full_url) for pattern in allowed_patterns):
                yield rv.follow(href)

Filter by domain/subdomain

Restrict crawling to specific domains:

from urllib.parse import urlparse

async def parse(self, response):
    rv = self.response_view(response)

    allowed_domains = {"example.com", "shop.example.com"}

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)
            domain = urlparse(full_url).netloc

            if domain in allowed_domains:
                yield rv.follow(href)

Domain filtering with subdomains:

from urllib.parse import urlparse

async def parse(self, response):
    rv = self.response_view(response)

    base_domain = "example.com"

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)
            domain = urlparse(full_url).netloc

            # Allow base domain and all subdomains
            if domain == base_domain or domain.endswith(f".{base_domain}"):
                yield rv.follow(href)

Filter by link text or attributes

Select links based on visible text or HTML attributes:

async def parse(self, response):
    rv = self.response_view(response)

    # Only follow links with specific text or CSS classes
    for link in rv.doc.cssselect("a"):
        link_text = link.text_content().strip().lower()
        css_class = link.get("class", "")

        # Follow category or product links
        if "category" in link_text or "product" in css_class:
            href = link.get("href")
            if href:
                yield rv.follow(href)

Filter by data attributes:

async def parse(self, response):
    rv = self.response_view(response)

    # Follow links with specific data attributes
    for link in rv.doc.cssselect("a[data-type]"):
        link_type = link.get("data-type")

        if link_type in {"product", "category", "brand"}:
            href = link.get("href")
            if href:
                yield rv.follow(href)

Filter by file extension

Skip links to binary or media files such as documents, images, and archives:

async def parse(self, response):
    rv = self.response_view(response)

    # File extensions to skip
    skip_extensions = {".pdf", ".jpg", ".png", ".zip", ".exe", ".mp4"}

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)

            # Check if URL ends with skipped extension
            if not any(full_url.lower().endswith(ext) for ext in skip_extensions):
                yield rv.follow(href)
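
One caveat: matching extensions against the full URL can miss files served with a query string (a link ending in ".pdf?download=1" would slip through the check above). A small variant of the same skip list, comparing only the URL path:

from urllib.parse import urlparse

async def parse(self, response):
    rv = self.response_view(response)

    skip_extensions = {".pdf", ".jpg", ".png", ".zip", ".exe", ".mp4"}

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)

            # Compare against the path component only, so query strings
            # cannot hide the file extension
            path = urlparse(full_url).path.lower()
            if not any(path.endswith(ext) for ext in skip_extensions):
                yield rv.follow(href)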

Filter by URL parameters

Filter based on query parameters:

from urllib.parse import urlparse, parse_qs

async def parse(self, response):
    rv = self.response_view(response)

    for link in rv.doc.cssselect("a"):
        href = link.get("href")
        if href:
            full_url = rv.urljoin(href)
            parsed = urlparse(full_url)
            params = parse_qs(parsed.query)

            # Only follow search results with category parameter
            if "category" in params:
                yield rv.follow(href)

Combining filters

Use multiple filter criteria together:

import re
from urllib.parse import urlparse

class FilteredSpider(Spider):
    name = "filtered"
    start_urls = ["https://example.com"]

    def should_follow(self, url):
        """Centralized filtering logic."""
        parsed = urlparse(url)

        # Domain filter
        if parsed.netloc != "example.com":
            return False

        # URL pattern filter
        if not re.search(r"/(products?|categories)/", parsed.path):
            return False

        # Extension filter
        if parsed.path.lower().endswith((".pdf", ".jpg", ".png")):
            return False

        return True

    async def parse(self, response):
        rv = self.response_view(response)

        for link in rv.doc.cssselect("a"):
            href = link.get("href")
            if href:
                full_url = rv.urljoin(href)
                if self.should_follow(full_url):
                    yield rv.follow(href)

Best practices

  • Filter early: Avoid scheduling unnecessary requests to reduce queue size
  • Validate URLs before following: Check patterns, domains, and URL structure
  • Test filters incrementally: Start with broad filters, then refine based on results
  • Centralize filter logic: Use helper methods for complex filtering rules
  • Log filtered URLs: Track what's being excluded for debugging
  • Handle edge cases: Account for relative URLs, fragments, and malformed links (both points are shown in the sketch after this list)
  • Use allowed_domains: Consider using built-in domain filtering when available
  • Document filter rules: Comment why certain patterns are included/excluded
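
A minimal sketch of the logging and edge-case points above, reusing the should_follow() helper from Combining filters. It assumes Python's standard logging module and that rv.follow() also accepts the absolute URL produced by rv.urljoin():

import logging
from urllib.parse import urldefrag

logger = logging.getLogger(__name__)

async def parse(self, response):
    rv = self.response_view(response)

    for link in rv.doc.cssselect("a"):
        href = (link.get("href") or "").strip()

        # Skip empty, fragment-only, and non-HTTP links up front
        if not href or href.startswith(("#", "mailto:", "javascript:", "tel:")):
            continue

        # Resolve relative URLs and drop the fragment so duplicate pages collapse
        full_url, _fragment = urldefrag(rv.urljoin(href))

        if self.should_follow(full_url):
            yield rv.follow(full_url)
        else:
            # Record what was excluded to make filter debugging easier
            logger.debug("Filtered out: %s", full_url)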

See also: Crawl Ordering, Pagination