Email Extractor in Python — regex, html.parser, and BeautifulSoup
Extract email addresses from plain text, HTML pages, and files using Python. This guide covers regex patterns, BeautifulSoup scraping, bulk file scanning, and validation techniques.
Python’s re module combined with BeautifulSoup handles most email extraction needs. The core challenge is a regex that catches real emails without matching false positives like user@.com or @mentions.
Use the Email & URL Extractor to extract emails directly in the browser.
Basic email regex
import re
EMAIL_PATTERN = r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'


def extract_emails(text: str) -> list[str]:
    """Return every substring of *text* that looks like an email address.

    Matches in document order; duplicates are kept.
    """
    matcher = re.compile(EMAIL_PATTERN, re.IGNORECASE)
    return matcher.findall(text)
text = """
Contact us at support@example.com or sales@company.co.uk.
Reach John at j.doe+work@subdomain.example.org.
Not valid: user@.com or @twitter
"""
print(extract_emails(text))
# ['support@example.com', 'sales@company.co.uk', 'j.doe+work@subdomain.example.org']
Extract emails from a file
def extract_emails_from_file(filepath: str) -> list[str]:
    """Read *filepath* and return its unique email addresses.

    Undecodable bytes are skipped (errors='ignore'); result order is
    unspecified because duplicates are dropped via a set.
    """
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        found = re.findall(EMAIL_PATTERN, handle.read(), re.IGNORECASE)
    return list(set(found))
# Scan an entire directory:
import os
def scan_directory(directory: str) -> dict[str, list[str]]:
    """Recursively scan *directory*, mapping file path -> emails found there.

    Only text-like extensions are inspected; files with no hits are omitted.
    """
    text_suffixes = ('.txt', '.html', '.csv', '.md')
    hits: dict[str, list[str]] = {}
    for root, _dirs, filenames in os.walk(directory):
        for name in filenames:
            if not name.endswith(text_suffixes):
                continue
            full_path = os.path.join(root, name)
            found = extract_emails_from_file(full_path)
            if found:
                hits[full_path] = found
    return hits
Extract emails from HTML with BeautifulSoup
HTML often has emails in href="mailto:..." links as well as plain text:
from bs4 import BeautifulSoup
import requests
def extract_emails_from_url(url: str) -> list[str]:
    """Fetch *url* and return a sorted list of unique, lower-cased emails.

    Looks in two places: ``mailto:`` hrefs and the rendered page text.
    Network errors and timeouts propagate as ``requests`` exceptions.
    """
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    emails = set()
    # From mailto: links (scheme comparison is case-insensitive: MAILTO: occurs in the wild)
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.lower().startswith('mailto:'):
            # strip the 7-char scheme and any ?subject=... query params
            email = href[7:].split('?')[0].strip()
            if email:
                # lower-case so mailto: hits and page-text hits deduplicate together
                emails.add(email.lower())
    # From visible page text
    page_text = soup.get_text()
    for email in re.findall(EMAIL_PATTERN, page_text):
        emails.add(email.lower())
    return sorted(emails)
Extract from obfuscated HTML
Sites sometimes obfuscate emails to foil scrapers:
def extract_obfuscated_emails(html: str) -> list[str]:
    """Find emails hidden behind common obfuscation tricks in *html*.

    Handles "[at]"/"(at)"/" at " and "[dot]"/"(dot)"/" dot " substitutions
    as well as HTML-entity encoding; returns unique addresses, unordered.
    """
    import html as html_module

    soup = BeautifulSoup(html, 'html.parser')
    emails = set()
    # Undo the [at]/[dot]-style substitutions in the visible text
    text = soup.get_text()
    for needle, replacement in (
        ('[at]', '@'), ('(at)', '@'), (' at ', '@'),
        ('[dot]', '.'), ('(dot)', '.'), (' dot ', '.'),
    ):
        text = text.replace(needle, replacement)
    emails.update(re.findall(EMAIL_PATTERN, text))
    # Undo HTML entities (e.g. &#64; encodes '@', &#46; encodes '.')
    decoded = html_module.unescape(html)
    emails.update(re.findall(EMAIL_PATTERN, decoded))
    return list(emails)
Validate extracted emails
Regex extraction catches format — validation checks deliverability:
import dns.resolver   # pip install dnspython
import dns.exception  # explicit: dns.exception.DNSException is referenced below


def validate_email_domain(email: str) -> bool:
    """Check that the email's domain has MX records (i.e. can receive mail).

    Returns False for malformed input (no '@') and on any DNS failure,
    including NXDOMAIN, empty answers, missing nameservers, and timeouts.
    """
    # Domain is everything after the *last* '@' (a quoted local part may contain '@');
    # the original split('@')[1] raised IndexError on strings without '@'.
    _, sep, domain = email.rpartition('@')
    if not sep or not domain:
        return False
    try:
        dns.resolver.resolve(domain, 'MX')
        return True
    # DNSException is the base class of NoAnswer, NXDOMAIN, NoNameservers, Timeout, ...
    except dns.exception.DNSException:
        return False
# Quick format-only validation:
def is_valid_email_format(email: str) -> bool:
    """Format-only check: True iff the *entire* string is a plausible address.

    Uses re.fullmatch rather than re.match: with re.match, the '$' anchor
    also matches just before a trailing newline, so a string with a
    trailing newline after the address would incorrectly validate.
    """
    pattern = r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$'
    return bool(re.fullmatch(pattern, email))
# Batch extraction + validation:
def extract_and_validate(text: str) -> dict:
    """Extract all addresses from *text*, then split them by format validity.

    Returns {'total_found': int, 'valid': [...], 'invalid': [...]}.
    """
    found = extract_emails(text)
    valid: list[str] = []
    invalid: list[str] = []
    for address in found:
        bucket = valid if is_valid_email_format(address) else invalid
        bucket.append(address)
    return {
        'total_found': len(found),
        'valid': valid,
        'invalid': invalid,
    }
Bulk extraction from CSV
import csv
def extract_emails_from_csv(filepath: str, column: str) -> list[str]:
    """Pull unique email addresses out of one *column* of a CSV file.

    Rows missing the column contribute nothing; result order is unspecified.
    """
    seen: set[str] = set()
    with open(filepath, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            cell = record.get(column, '')
            seen.update(re.findall(EMAIL_PATTERN, cell))
    return list(seen)
# Usage: collect all addresses from the 'email_address' column of contacts.csv.
emails = extract_emails_from_csv('contacts.csv', 'email_address')
Command-line email extractor
#!/usr/bin/env python3
"""Extract emails from files or stdin."""
import sys
import re
import argparse

EMAIL_PATTERN = r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'


def main():
    """Read the named files (or stdin when none) and print each address found."""
    parser = argparse.ArgumentParser(description='Extract email addresses')
    parser.add_argument('files', nargs='*', help='Files to process')
    parser.add_argument('-u', '--unique', action='store_true', help='Remove duplicates')
    args = parser.parse_args()

    matches: list[str] = []
    if not args.files:
        # No file arguments: behave like a filter and scan stdin instead
        matches.extend(re.findall(EMAIL_PATTERN, sys.stdin.read()))
    else:
        for filepath in args.files:
            with open(filepath, encoding='utf-8', errors='ignore') as handle:
                matches.extend(re.findall(EMAIL_PATTERN, handle.read()))

    if args.unique:
        # dict.fromkeys keeps first-seen order while dropping repeats
        matches = list(dict.fromkeys(matches))

    for address in matches:
        print(address)


if __name__ == '__main__':
    main()
# Usage:
python extract_emails.py contacts.html -u
cat emails.txt | python extract_emails.py -u
Related tools
- Email & URL Extractor — extract emails and URLs online
- Extract URLs from Text — URL extraction patterns
Related posts
- Contact Information Extraction — Emails, Phones, and URLs from Text — Extract emails, phone numbers, URLs, and addresses from unstructured text using …
- Email Extractor — How to Pull Email Addresses from Text — An email extractor scans a block of text and finds all valid email addresses. He…
- Email Extractor — Extract Email Addresses from Text — An email extractor finds and pulls all email addresses from a block of text usin…
- Extract URLs from Text — Regex and Libraries for URL Detection — Extracting URLs from plain text requires a regex that handles http, https, and v…
- Web Scraping Email Addresses — Ethical Practices and Python Tools — Scrape email addresses from websites ethically using Python requests, BeautifulS…
Related tool
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
Written by Mian Ali Khalid. Part of the Dev Productivity pillar.