
Web Scraping Email Addresses — Ethical Practices and Python Tools

Scrape email addresses from websites ethically using Python requests, BeautifulSoup, and Scrapy. Covers robots.txt compliance, rate limiting, handling JavaScript-rendered pages, and legal considerations.

Mian Ali Khalid · 5 min read

Web scraping email addresses has legitimate uses — building your own site’s contact database, auditing your own content, research — but requires respecting robots.txt, rate limits, and applicable laws like CAN-SPAM and GDPR.

Use the Email & URL Extractor to extract emails from a single page without scraping.

Simple single-page scraper

import requests
from bs4 import BeautifulSoup
import re
import time

EMAIL_PATTERN = re.compile(
    r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
)

def scrape_emails(url: str, delay: float = 1.0) -> list[str]:
    """Scrape emails from a single URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; EmailScraper/1.0; +https://yoursite.com/bot)',
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    found = set()
    
    # From mailto links
    for a in soup.find_all('a', href=True):
        if a['href'].startswith('mailto:'):
            email = a['href'][7:].split('?')[0].strip()
            if email:
                found.add(email.lower())
    
    # From page text
    for email in EMAIL_PATTERN.findall(soup.get_text()):
        found.add(email.lower())
    
    time.sleep(delay)  # Be polite
    return sorted(found)
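
A quick check against a page you own might look like this (the URL is a placeholder):

if __name__ == '__main__':
    # Placeholder URL — point this at your own contact page.
    for email in scrape_emails('https://example.com/contact'):
        print(email)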

Multi-page crawler

from urllib.parse import urljoin, urlparse
from collections import deque

def crawl_for_emails(start_url: str, max_pages: int = 50) -> dict:
    """Crawl a site and collect all emails found."""
    parsed_start = urlparse(start_url)
    base_domain = parsed_start.netloc
    
    visited = set()
    queue = deque([start_url])
    all_emails = {}
    
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        
        visited.add(url)
        emails = scrape_emails(url)
        if emails:
            all_emails[url] = emails
        
        # Find links to the same domain. Note: this fetches the page a second
        # time just for link discovery; a production crawler would reuse the
        # response from scrape_emails instead of requesting each URL twice.
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            for a in soup.find_all('a', href=True):
                link = urljoin(url, a['href'])
                parsed = urlparse(link)
                
                # Only same domain, no fragments or external links
                if (parsed.netloc == base_domain and 
                    link not in visited and
                    not parsed.fragment and
                    parsed.scheme in ('http', 'https')):
                    queue.append(link)
        
        except Exception:
            pass  # Skip pages that fail to fetch or parse
        
        time.sleep(1)  # 1 second between requests
    
    return all_emails
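
Calling it looks like this (again with a placeholder domain); the result maps each page URL to the emails found on it:

results = crawl_for_emails('https://example.com', max_pages=20)
for page_url, emails in results.items():
    print(f"{page_url}: {', '.join(emails)}")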

Check robots.txt before crawling

from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

def is_allowed(url: str, user_agent: str = '*') -> bool:
    """Check if crawling this URL is allowed by robots.txt."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # If we can't fetch robots.txt, assume allowed
    
    return rp.can_fetch(user_agent, url)

# Usage:
if is_allowed('https://example.com/contact', 'EmailScraper'):
    emails = scrape_emails('https://example.com/contact')
else:
    print("Robots.txt disallows scraping this URL")

Scrapy spider for email extraction

# scrapy_email/spiders/email_spider.py
import scrapy
import re
from urllib.parse import urlparse

class EmailSpider(scrapy.Spider):
    name = 'email_spider'
    
    EMAIL_PATTERN = re.compile(
        r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
    )
    
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 1,
    }
    
    def __init__(self, start_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url] if start_url else []
        self.allowed_domains = [urlparse(start_url).netloc] if start_url else []
    
    def parse(self, response):
        emails = set()
        
        for href in response.css('a[href^="mailto:"]::attr(href)').getall():
            email = href.replace('mailto:', '').split('?')[0].strip()
            if email:
                emails.add(email.lower())
        
        for email in self.EMAIL_PATTERN.findall(response.text):
            emails.add(email.lower())
        
        if emails:
            yield {'url': response.url, 'emails': list(emails)}
        
        for link in response.css('a::attr(href)').getall():
            yield response.follow(link, self.parse)
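
Inside a Scrapy project you would run this with scrapy crawl email_spider -a start_url=https://example.com -o emails.json. Outside a project, a minimal standalone runner using CrawlerProcess could look like the sketch below (assumes a recent Scrapy version with the FEEDS setting; the start URL and output file are placeholders):

# run_spider.py — minimal standalone runner
from scrapy.crawler import CrawlerProcess
from scrapy_email.spiders.email_spider import EmailSpider  # path from the file shown above

process = CrawlerProcess(settings={
    'FEEDS': {'emails.json': {'format': 'json'}},  # write scraped items to emails.json
})
process.crawl(EmailSpider, start_url='https://example.com')
process.start()  # blocks until the crawl finishes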

Handling JavaScript-rendered pages

Some contact pages load dynamically — requests can’t see them:

from playwright.sync_api import sync_playwright  # pip install playwright

def scrape_js_page(url: str) -> list[str]:
    """Scrape emails from a JavaScript-rendered page."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url, wait_until='networkidle')
        
        # Extract mailto links
        emails = set()
        links = page.query_selector_all('a[href^="mailto:"]')
        for link in links:
            href = link.get_attribute('href') or ''
            email = href.replace('mailto:', '').split('?')[0].strip()
            if email:
                emails.add(email.lower())
        
        # From text content
        text = page.inner_text('body')
        for email in EMAIL_PATTERN.findall(text):
            emails.add(email.lower())
        
        browser.close()
        return sorted(emails)
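
Playwright also needs a browser binary: after pip install playwright, run playwright install chromium once. EMAIL_PATTERN here is the same regex defined in the first example.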

Legal and ethical considerations

  • GDPR: In the EU, collecting personal data (including email addresses) without a lawful basis is restricted. Only collect emails you have a legitimate basis to process.
  • CAN-SPAM: In the US, harvested emails cannot be used for commercial email without compliance measures.
  • robots.txt: Respect it. ROBOTSTXT_OBEY = True in Scrapy does this automatically.
  • Rate limits: Keep DOWNLOAD_DELAY >= 1 second; aggressive crawling can harm the site. A settings sketch follows this list.
  • Only scrape your own sites for production email databases.
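
For Scrapy projects, a minimal politeness block in settings.py could look like this; the AutoThrottle values are illustrative, not prescriptions:

# settings.py — illustrative politeness settings
ROBOTSTXT_OBEY = True                # honour robots.txt
DOWNLOAD_DELAY = 1                   # at least 1 second between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1   # one request at a time per site
AUTOTHROTTLE_ENABLED = True          # back off automatically if the server slows down
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
USER_AGENT = 'EmailScraper/1.0 (+https://yoursite.com/bot)'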


Written by Mian Ali Khalid. Part of the Dev Productivity pillar.