Web Scraping Email Addresses — Ethical Practices and Python Tools
Scrape email addresses from websites ethically using Python requests, BeautifulSoup, and Scrapy. Covers robots.txt compliance, rate limiting, and handling JavaScript-rendered pages.
Use the tool
Email & URL Extractor
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
Web scraping email addresses has legitimate uses — building your own site’s contact database, auditing your own content, research — but requires respecting robots.txt, rate limits, and applicable laws like CAN-SPAM and GDPR.
Use the Email & URL Extractor to extract emails from a single page without scraping.
Simple single-page scraper
import requests
from bs4 import BeautifulSoup
import re
import time
# Matches typical addresses like "user.name+tag@example.co": a local part,
# '@', domain labels, and a TLD of 2+ letters. The \b anchors keep the match
# from starting or ending inside a longer token.
EMAIL_PATTERN = re.compile(
    r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
)
def scrape_emails(url: str, delay: float = 1.0) -> list[str]:
    """Scrape email addresses from a single URL.

    Collects addresses from both ``mailto:`` links and visible page text,
    normalises them to lowercase, and returns them sorted.

    Args:
        url: Page to fetch.
        delay: Seconds to sleep after the request (politeness delay).

    Returns:
        Sorted list of unique, lowercased addresses; empty list if the
        page could not be fetched.
    """
    from urllib.parse import unquote

    headers = {
        # Identify the bot honestly and give site owners a contact URL.
        'User-Agent': 'Mozilla/5.0 (compatible; EmailScraper/1.0; +https://yoursite.com/bot)',
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    found = set()
    # From mailto links. RFC 6068 allows several comma-separated addresses
    # and percent-encoding in the href, so split on commas, unquote each
    # candidate, and validate it against the same pattern used for text.
    for a in soup.find_all('a', href=True):
        if a['href'].startswith('mailto:'):
            address_part = a['href'][7:].split('?')[0]
            for candidate in address_part.split(','):
                email = unquote(candidate).strip()
                if email and EMAIL_PATTERN.fullmatch(email):
                    found.add(email.lower())
    # From page text
    for email in EMAIL_PATTERN.findall(soup.get_text()):
        found.add(email.lower())
    time.sleep(delay)  # Be polite
    return sorted(found)
Multi-page crawler
from urllib.parse import urljoin, urlparse
from collections import deque
def crawl_for_emails(start_url: str, max_pages: int = 50) -> dict:
    """Crawl a site breadth-first and collect all emails found.

    Only follows links on the same domain as ``start_url`` and stops
    after visiting ``max_pages`` pages.

    Args:
        start_url: Page to begin crawling from.
        max_pages: Maximum number of pages to visit.

    Returns:
        Mapping of page URL -> list of emails found there (pages with
        no emails are omitted).
    """
    from urllib.parse import urldefrag

    base_domain = urlparse(start_url).netloc
    visited = set()
    queued = {start_url}  # every URL ever enqueued, to avoid duplicates
    queue = deque([start_url])
    all_emails = {}
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        emails = scrape_emails(url)
        if emails:
            all_emails[url] = emails
        # Find links to same domain
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            for a in soup.find_all('a', href=True):
                # Strip the #fragment so '/page#top' and '/page' count
                # as one URL instead of the fragment link being skipped.
                link, _ = urldefrag(urljoin(url, a['href']))
                parsed = urlparse(link)
                # Only same domain, only http(s), never queued before.
                if (parsed.netloc == base_domain and
                        link not in queued and
                        parsed.scheme in ('http', 'https')):
                    queued.add(link)
                    queue.append(link)
        except requests.RequestException:
            # Best-effort: a page that fails to load is skipped and the
            # crawl continues.
            pass
        time.sleep(1)  # 1 second between requests
    return all_emails
Check robots.txt before crawling
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin
def is_allowed(url: str, user_agent: str = '*') -> bool:
    """Check if crawling this URL is allowed by robots.txt."""
    target = urlparse(url)
    robots_location = f"{target.scheme}://{target.netloc}/robots.txt"
    parser = RobotFileParser()
    parser.set_url(robots_location)
    try:
        parser.read()
    except Exception:
        # robots.txt unreachable or unreadable: err on the side of
        # proceeding, which is the common crawler convention.
        return True
    return parser.can_fetch(user_agent, url)
# Usage: consult robots.txt first, scrape only when permitted.
if is_allowed('https://example.com/contact', 'MyBot'):
    emails = scrape_emails('https://example.com/contact')
else:
    print("Robots.txt disallows scraping this URL")
Scrapy spider for email extraction
# scrapy_email/spiders/email_spider.py
import scrapy
import re
from urllib.parse import urljoin
class EmailSpider(scrapy.Spider):
    """Crawl a site politely and yield the emails found on each page."""

    name = 'email_spider'
    # local@domain.tld with a 2+ letter TLD; matches are lowercased on
    # collection so duplicates differing only in case collapse.
    EMAIL_PATTERN = re.compile(
        r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
    )
    # Polite defaults: obey robots.txt, one request at a time, 1 s apart.
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 1,
    }

    def __init__(self, start_url=None, *args, **kwargs):
        """Accept the start URL via -a start_url=... on the CLI."""
        super().__init__(*args, **kwargs)
        from urllib.parse import urlparse
        self.start_urls = [start_url] if start_url else []
        # Derive the domain with urlparse instead of start_url.split('/')[2],
        # which raised IndexError for scheme-less URLs like 'example.com/x'.
        netloc = urlparse(start_url).netloc if start_url else ''
        self.allowed_domains = [netloc] if netloc else []

    def parse(self, response):
        """Extract emails from one page, then follow its links."""
        emails = set()
        # From mailto: links (query-string headers like ?subject= removed).
        for href in response.css('a[href^="mailto:"]::attr(href)').getall():
            email = href.replace('mailto:', '').split('?')[0].strip()
            if email:
                emails.add(email.lower())
        # From the raw page body.
        for email in self.EMAIL_PATTERN.findall(response.text):
            emails.add(email.lower())
        if emails:
            yield {'url': response.url, 'emails': list(emails)}
        for link in response.css('a::attr(href)').getall():
            yield response.follow(link, self.parse)
Handling JavaScript-rendered pages
Some contact pages load dynamically — requests can’t see them:
from playwright.sync_api import sync_playwright # pip install playwright
def scrape_js_page(url: str) -> list[str]:
    """Scrape emails from a JavaScript-rendered page.

    Launches headless Chromium via Playwright, waits until the network
    goes idle so dynamic content has loaded, then extracts addresses
    from mailto: links and the rendered body text.

    Args:
        url: Page to render and scan.

    Returns:
        Sorted list of unique, lowercased email addresses.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, wait_until='networkidle')
            emails = set()
            # Extract mailto links from the rendered DOM.
            for link in page.query_selector_all('a[href^="mailto:"]'):
                href = link.get_attribute('href') or ''
                email = href.replace('mailto:', '').split('?')[0].strip()
                if email:
                    emails.add(email.lower())
            # From visible text content.
            for email in EMAIL_PATTERN.findall(page.inner_text('body')):
                emails.add(email.lower())
        finally:
            # Always release the browser, even if navigation fails.
            browser.close()
    return sorted(emails)
Legal and ethical considerations
- GDPR: In the EU, collecting personal data (including email addresses) without consent is restricted. Only collect emails you have a legitimate basis for.
- CAN-SPAM: In the US, harvested emails cannot be used for commercial email without compliance measures.
- robots.txt: Respect it. ROBOTSTXT_OBEY = True in Scrapy does this automatically.
- Rate limits: Keep DOWNLOAD_DELAY >= 1 second. Aggressive crawling can harm the site.
- Only scrape your own sites for production email databases.
Related tools
- Email & URL Extractor — extract emails without scraping
- Email Extractor Python — Python email extraction
- Extract Emails from HTML — parsing HTML for emails
Related posts
- Contact Information Extraction — Emails, Phones, and URLs from Text — Extract emails, phone numbers, URLs, and addresses from unstructured text using …
- Email Extractor — How to Pull Email Addresses from Text — An email extractor scans a block of text and finds all valid email addresses. He…
- Email Extractor — Extract Email Addresses from Text — An email extractor finds and pulls all email addresses from a block of text usin…
- Email Extractor in Python — regex, html.parser, and BeautifulSoup — Extract email addresses from plain text, HTML pages, and files using Python. Thi…
- Extract Emails from HTML — Parsing mailto Links and Text — Extract email addresses from HTML pages by scanning mailto: links, data attribut…
Related tool
Email & URL Extractor
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
Written by Mian Ali Khalid. Part of the Dev Productivity pillar.