
Email Extractor in Python — regex, html.parser, and BeautifulSoup

Extract email addresses from plain text, HTML pages, and files using Python. This guide covers regex patterns, BeautifulSoup scraping, bulk file scanning, and validation.

Mian Ali Khalid · 5 min read

Python’s re module combined with BeautifulSoup handles most email extraction needs. The core challenge is a regex that catches real emails without matching false positives like user@.com or @mentions.

Use the Email & URL Extractor to extract emails directly in the browser.

Basic email regex

import re

EMAIL_PATTERN = r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'

def extract_emails(text: str) -> list[str]:
    return re.findall(EMAIL_PATTERN, text, re.IGNORECASE)

text = """
Contact us at support@example.com or sales@company.co.uk.
Reach John at j.doe+work@subdomain.example.org.
Not valid: user@.com or @twitter
"""

print(extract_emails(text))
# ['support@example.com', 'sales@company.co.uk', 'j.doe+work@subdomain.example.org']
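
The browser tool's output is case-insensitive, deduplicated, and sorted; to mirror that here, one extra line on top of extract_emails does it:

print(sorted({e.lower() for e in extract_emails(text)}))
# ['j.doe+work@subdomain.example.org', 'sales@company.co.uk', 'support@example.com']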

Extract emails from a file

def extract_emails_from_file(filepath: str) -> list[str]:
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    emails = re.findall(EMAIL_PATTERN, content, re.IGNORECASE)
    return list(set(emails))  # deduplicate

# Scan an entire directory:
import os

def scan_directory(directory: str) -> dict[str, list[str]]:
    results = {}
    for root, _, files in os.walk(directory):
        for filename in files:
            if filename.endswith(('.txt', '.html', '.csv', '.md')):
                path = os.path.join(root, filename)
                emails = extract_emails_from_file(path)
                if emails:
                    results[path] = emails
    return results
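
A minimal usage sketch; the ./exports path is a placeholder for whatever directory you want to scan:

# Hypothetical usage: './exports' is a placeholder path.
for path, found in scan_directory('./exports').items():
    print(f'{path}: {len(found)} unique email(s)')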

Extract emails from HTML with BeautifulSoup

HTML often has emails in href="mailto:..." links as well as plain text:

from bs4 import BeautifulSoup
import requests

def extract_emails_from_url(url: str) -> list[str]:
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    emails = set()
    
    # From mailto: links
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('mailto:'):
            email = href[7:].split('?')[0]  # strip 'mailto:' and query params
            emails.add(email.strip().lower())  # lowercase so the set dedupes
    
    # From page text
    page_text = soup.get_text()
    for email in re.findall(EMAIL_PATTERN, page_text):
        emails.add(email.lower())
    
    return sorted(emails)
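
A quick usage sketch; the URL is a placeholder and the call needs network access:

# Hypothetical usage: any reachable page with contact info will do.
for email in extract_emails_from_url('https://example.com/contact'):
    print(email)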

Extract from obfuscated HTML

Sites sometimes obfuscate emails to foil scrapers:

def extract_obfuscated_emails(html: str) -> list[str]:
    soup = BeautifulSoup(html, 'html.parser')
    emails = set()
    
    # Handle [at]/(at) and [dot]/(dot) substitution, with or without
    # surrounding spaces. The bare ' at ' / ' dot ' fallback is aggressive
    # and can manufacture false positives ('meet at example.com').
    text = soup.get_text()
    text_normalized = re.sub(r'\s*(?:\[at\]|\(at\))\s*| at ', '@', text)
    text_normalized = re.sub(r'\s*(?:\[dot\]|\(dot\))\s*| dot ', '.', text_normalized)
    for email in re.findall(EMAIL_PATTERN, text_normalized):
        emails.add(email)
    
    # Handle HTML entities: &#64; = @, &#46; = .
    import html as html_module
    decoded = html_module.unescape(html)
    for email in re.findall(EMAIL_PATTERN, decoded):
        emails.add(email)
    
    return list(emails)
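
A sanity check with a made-up snippet; both addresses are placeholders, covering the bracket style and the HTML-entity style:

sample = """
<p>Write to john [at] example [dot] com,
or <span>jane&#64;example&#46;org</span>.</p>
"""
print(sorted(extract_obfuscated_emails(sample)))
# ['jane@example.org', 'john@example.com']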

Validate extracted emails

Regex extraction only checks format; an MX lookup on the domain tells you whether it can actually receive mail:

import dns.exception
import dns.resolver  # pip install dnspython

def validate_email_domain(email: str) -> bool:
    """Check that the email domain has MX records."""
    domain = email.split('@')[1]
    try:
        dns.resolver.resolve(domain, 'MX')
        return True
    except dns.exception.DNSException:  # covers NoAnswer, NXDOMAIN, timeouts
        return False

# Quick format-only validation:
def is_valid_email_format(email: str) -> bool:
    pattern = r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$'
    return bool(re.match(pattern, email))

# Batch extraction + validation. Since extract_emails() already filters by
# EMAIL_PATTERN, 'invalid' stays empty here; the split matters when the raw
# list comes from another source (user input, a CSV column):
def extract_and_validate(text: str) -> dict:
    raw = extract_emails(text)
    return {
        'total_found': len(raw),
        'valid': [e for e in raw if is_valid_email_format(e)],
        'invalid': [e for e in raw if not is_valid_email_format(e)],
    }
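
Putting the pieces together; this sketch reuses `text` from the first example and needs network access for the MX lookups. Reserved domains like example.com often publish no MX records, so the deliverable list can legitimately come back empty:

report = extract_and_validate(text)  # `text` from the basic regex example
deliverable = [e for e in report['valid'] if validate_email_domain(e)]
print(f"{len(deliverable)} of {report['total_found']} look deliverable")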

Bulk extraction from CSV

import csv

def extract_emails_from_csv(filepath: str, column: str) -> list[str]:
    emails = []
    with open(filepath, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            cell = row.get(column, '')
            found = re.findall(EMAIL_PATTERN, cell)
            emails.extend(found)
    return list(set(emails))

# Usage:
emails = extract_emails_from_csv('contacts.csv', 'email_address')

Command-line email extractor

#!/usr/bin/env python3
"""Extract emails from files or stdin."""
import sys
import re
import argparse

EMAIL_PATTERN = r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'

def main():
    parser = argparse.ArgumentParser(description='Extract email addresses')
    parser.add_argument('files', nargs='*', help='Files to process')
    parser.add_argument('-u', '--unique', action='store_true', help='Remove duplicates')
    args = parser.parse_args()
    
    emails = []
    
    if args.files:
        for filepath in args.files:
            with open(filepath, encoding='utf-8', errors='ignore') as f:
                emails.extend(re.findall(EMAIL_PATTERN, f.read()))
    else:
        # Read from stdin
        emails.extend(re.findall(EMAIL_PATTERN, sys.stdin.read()))
    
    if args.unique:
        emails = list(dict.fromkeys(emails))  # deduplicate, preserve order
    
    for email in emails:
        print(email)

if __name__ == '__main__':
    main()
# Usage:
#   python extract_emails.py contacts.html -u
#   cat emails.txt | python extract_emails.py -u


Written by Mian Ali Khalid. Part of the Dev Productivity pillar.