Contact Information Extraction — Emails, Phones, and URLs from Text
Extract emails, phone numbers, URLs, and addresses from unstructured text using regex and NLP libraries. Includes Python and JavaScript implementations for bulk contact extraction.
Use the tool
Email & URL Extractor
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
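Extracting contact information from unstructured text combines regex for emails, phone numbers, and URLs with NLP for names and addresses. Here’s a regex-based Python approach covering emails, phone numbers, URLs, and LinkedIn and Twitter/X profiles; names and postal addresses need an NLP library and are not handled by the code below.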
Use the Email & URL Extractor to extract contacts from a single document.
Complete contact extraction (Python)
import re
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class ContactInfo:
    """Contact details extracted from one block of text.

    Every field defaults to its own fresh empty list, so instances never
    share list objects.
    """

    emails: list[str] = field(default_factory=list)  # email addresses
    phones: list[str] = field(default_factory=list)  # phone numbers
    urls: list[str] = field(default_factory=list)  # http(s) URLs
    linkedin: list[str] = field(default_factory=list)  # linkedin.com/in/... paths
    twitter: list[str] = field(default_factory=list)  # Twitter/X handles
# Regexes for each contact type, compiled once at import time.
PATTERNS = {
    'email': re.compile(r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'),
    'phone': re.compile(
        r'(?<!\d)'  # never start in the middle of a longer digit run
        r'(?:'
        # US: optional +1/1 prefix, then 3-3-4 digits. The prefix is optional
        # as a whole so a match can no longer begin with stray whitespace.
        r'(?:\+?1\s*[\-.]?\s*)?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}'  # US
        r'|'
        # International: "+", then 7-15 digits that may be grouped with single
        # spaces, dots or hyphens (e.g. "+44 7911 123456").
        r'\+[1-9](?:[ \-.]?\d){6,14}'  # E.164 international
        r')'
        r'(?!\d)'  # ...and never stop in the middle of one
    ),
    # The trailing lookbehind drops sentence punctuation that directly follows
    # a URL ("see https://example.com.").
    'url': re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+(?<![.,;:!?])'),
    'linkedin': re.compile(r'linkedin\.com/in/[\w\-]+'),
    # Group 1: handle from a twitter.com/x.com URL; group 2: bare @handle.
    # The lookbehinds keep "x.com" from matching inside hosts such as
    # "dropbox.com" and keep "@handle" from matching the domain of an email.
    'twitter': re.compile(
        r'(?<![\w\-])(?:twitter\.com|x\.com)/(\w+)'
        r'|(?<![\w.%+\-])@(\w{1,15})(?!\w)'
    ),
}
def extract_contacts(text: str) -> ContactInfo:
    """Extract emails, phones, URLs and social handles from ``text``.

    Each list is deduplicated and keeps the order in which items first appear
    in ``text``, so repeated runs (and slices such as ``urls[:3]``) are
    deterministic.

    Args:
        text: Unstructured text to scan.

    Returns:
        A ``ContactInfo`` with one list per contact type. Emails and Twitter
        handles are lower-cased; phone matches are stripped of surrounding
        whitespace.
    """

    def _unique(values):
        # dict preserves insertion order, giving an order-stable dedupe.
        return list(dict.fromkeys(values))

    info = ContactInfo()
    info.emails = _unique(e.lower() for e in PATTERNS['email'].findall(text))
    info.phones = _unique(p.strip() for p in PATTERNS['phone'].findall(text))
    info.urls = _unique(PATTERNS['url'].findall(text))
    info.linkedin = _unique(PATTERNS['linkedin'].findall(text))
    # Twitter: the pattern has two groups, one for twitter.com/handle and one
    # for @handle; exactly one of them is filled per match.
    info.twitter = _unique(
        (url_handle or at_handle).lower()
        for url_handle, at_handle in PATTERNS['twitter'].findall(text)
        if url_handle or at_handle
    )
    return info
# Usage:
text = """
Contact John Smith at john.smith@company.com or call (555) 123-4567.
LinkedIn: linkedin.com/in/johnsmith | Twitter: @johnsmith
Website: https://johnsmith.dev
International: +44 7911 123456
"""
contacts = extract_contacts(text)
print(f"Emails: {contacts.emails}")
print(f"Phones: {contacts.phones}")
print(f"URLs: {contacts.urls}")
print(f"LinkedIn: {contacts.linkedin}")
# The sample contains a Twitter handle too, so show that field as well.
print(f"Twitter: {contacts.twitter}")
Normalize extracted phone numbers
import phonenumbers
def normalize_phones(raw_phones: list[str], region: str = 'US') -> list[str]:
    """Convert extracted phone strings to E.164 format.

    Args:
        raw_phones: Phone strings as found in text, in any notation.
        region: Region code handed to ``phonenumbers.parse`` together with
            each string.

    Returns:
        The E.164 form of every string that parses and passes
        ``phonenumbers.is_valid_number``, deduplicated, in first-seen order.
        Strings that fail to parse or validate are skipped.
    """
    normalized = []
    for phone in raw_phones:
        try:
            parsed = phonenumbers.parse(phone, region)
        except phonenumbers.NumberParseException:
            # Best effort: regex matches are noisy, so unparseable text is
            # dropped rather than aborting the whole batch.
            continue
        if phonenumbers.is_valid_number(parsed):
            normalized.append(
                phonenumbers.format_number(
                    parsed,
                    phonenumbers.PhoneNumberFormat.E164,
                )
            )
    # dict.fromkeys dedupes while keeping a stable, input-driven order.
    return list(dict.fromkeys(normalized))
# Fictional 555-123-4567 style numbers fail is_valid_number and would be
# dropped, so the example uses a real, assignable US number.
normalize_phones(['(650) 253-0000', '+1 650-253-0000', '6502530000'])
# ['+16502530000'] — deduplicated
JavaScript contact extraction
// Global (g) regexes keyed by contact type; extractContacts runs every entry
// against the input text.
const CONTACT_PATTERNS = {
// Email: local part, "@", dotted domain ending in a TLD of 2+ letters.
email: /\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b/g,
// Phone: optional "+1"/"1" prefix and optional 3-digit area code (parentheses
// allowed), then 3 + 4 digits. Because the area code group is optional, a bare
// 7-digit run also matches.
phone: /(?:\+?1\s*[-.]?\s*)?(?:\(?\d{3}\)?[\s-.]?)?\d{3}[\s-.]?\d{4}/g,
// URL: "http://" or "https://" followed by everything up to whitespace or one
// of < > " { } | \ ^ ` [ ].
url: /https?:\/\/[^\s<>"{}|\\^`\[\]]+/g,
// LinkedIn profile path: "linkedin.com/in/" plus a slug of word characters or hyphens.
linkedin: /linkedin\.com\/in\/[\w\-]+/g,
};
/**
 * Collect the unique matches for every pattern in CONTACT_PATTERNS.
 *
 * @param {string} text - Text to scan.
 * @returns {Object<string, string[]>} Unique matches keyed by contact type;
 *   types without any match are left out of the result.
 */
function extractContacts(text) {
  return Object.keys(CONTACT_PATTERNS).reduce((found, kind) => {
    // String.prototype.match returns null when nothing matches.
    const hits = text.match(CONTACT_PATTERNS[kind]) || [];
    const unique = Array.from(new Set(hits));
    if (unique.length > 0) {
      found[kind] = unique;
    }
    return found;
  }, {});
}
Process multiple files / business cards
import json
from pathlib import Path
def process_business_cards(texts: list[str]) -> list[dict]:
    """Extract contacts from a list of business card texts.

    Returns one dict per card with the keys ``raw``, ``emails``, ``phones``,
    ``urls`` and ``social``. Cards that yield no email, no normalized phone
    and no URL are left out.
    """
    cards = []
    for card_text in texts:
        found = extract_contacts(card_text)

        phones = []
        if found.phones:
            phones = normalize_phones(found.phones)

        record = {
            'raw': card_text[:100],  # First 100 chars for reference
            'emails': found.emails,
            'phones': phones,
            'urls': found.urls[:3],  # Max 3 URLs
            'social': {
                'linkedin': found.linkedin,
                'twitter': found.twitter,
            },
        }
        # Social handles alone do not keep a card in the result.
        if record['emails'] or record['phones'] or record['urls']:
            cards.append(record)
    return cards
# Export to CSV:
import csv
def export_to_csv(contacts: list[dict], output_path: str) -> None:
    """Write contact records to a CSV file, one row per record.

    Args:
        contacts: Records with the list-valued keys ``emails``, ``phones``
            and ``urls`` plus ``social['linkedin']``.
        output_path: Destination file; it is created or overwritten.

    The columns are ``email``, ``phone``, ``url`` and ``linkedin``; multiple
    values in one cell are joined with ``'; '``.
    """
    fieldnames = ['email', 'phone', 'url', 'linkedin']
    # newline='' leaves line endings to the csv module; the explicit UTF-8
    # encoding makes the output independent of the platform's default.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {
                'email': '; '.join(contact['emails']),
                'phone': '; '.join(contact['phones']),
                'url': '; '.join(contact['urls']),
                'linkedin': '; '.join(contact['social']['linkedin']),
            }
            for contact in contacts
        )
Related tools
- Email & URL Extractor — extract contacts online
- Email Extractor Python — Python email extraction
- Phone Number Regex — phone validation patterns
Related posts
- Email Extractor — How to Pull Email Addresses from Text — An email extractor scans a block of text and finds all valid email addresses. He…
- Email Extractor — Extract Email Addresses from Text — An email extractor finds and pulls all email addresses from a block of text usin…
- Email Extractor in Python — regex, html.parser, and BeautifulSoup — Extract email addresses from plain text, HTML pages, and files using Python. Thi…
- Extract Emails from HTML — Parsing mailto Links and Text — Extract email addresses from HTML pages by scanning mailto: links, data attribut…
- Phone Number Regex — Validate and Extract Phone Numbers — Phone number regex patterns for validation and extraction. Covers US phone numbe…
Related tool
Email & URL Extractor
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
Written by Mian Ali Khalid. Part of the Dev Productivity pillar.