X Xerobit

Contact Information Extraction — Emails, Phones, and URLs from Text

Extract emails, phone numbers, URLs, and addresses from unstructured text using regex and NLP libraries. Includes Python and JavaScript implementations for bulk contact...

Mian Ali Khalid · 5 min read
Use the tool
Email & URL Extractor
Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.
Open Email & URL Extractor →

Extracting contact information from unstructured text combines regex for emails, phone numbers, and URLs with NLP for names and addresses. Here’s a Python approach covering the regex-detectable contact types: emails, phone numbers, URLs, and LinkedIn and Twitter/X handles.

Use the Email & URL Extractor to extract contacts from a single document.

Complete contact extraction (Python)

import re
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ContactInfo:
    """Container for the contact details found in one piece of text.

    Every field defaults to its own empty list, so a fresh instance can be
    filled in incrementally (as ``extract_contacts`` does).
    """

    # Email addresses; extract_contacts stores them lower-cased.
    emails: list[str] = field(default_factory=list)
    # Phone numbers exactly as matched in the text (not normalized).
    phones: list[str] = field(default_factory=list)
    # http:// and https:// URLs.
    urls: list[str] = field(default_factory=list)
    # LinkedIn profile paths of the form "linkedin.com/in/<slug>".
    linkedin: list[str] = field(default_factory=list)
    # Twitter/X handles without the leading "@"; stored lower-cased.
    twitter: list[str] = field(default_factory=list)

# Compiled once at import time; every pattern is meant to be used with
# ``findall``/``finditer`` on free-form text.
PATTERNS = {
    # local-part@domain.tld with an alphabetic TLD of at least two letters.
    'email': re.compile(r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'),
    # Only non-capturing groups are used so that ``findall`` returns the
    # whole matched number as a plain string.
    'phone': re.compile(
        r'(?:'
        # US/NANP: the "+1"/"1" prefix (and the separator after it) is
        # optional *as a unit*, so a match can never start with stray
        # whitespace or punctuation that precedes the number.
        r'(?:\+?1\s*[\-.]?\s*)?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}'  # US
        r'|'
        # International: "+", a non-zero leading digit, then 6-14 more
        # digits, each optionally preceded by a space, dash or dot, so
        # "+44 7911 123456" is captured whole (E.164 allows 15 digits).
        r'\+[1-9](?:[ \-.]?\d){6,14}'  # E.164 international
        r')'
    ),
    'url': re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+'),
    'linkedin': re.compile(r'linkedin\.com/in/[\w\-]+'),
    # Two capture groups: (1) handle from a twitter.com / x.com profile URL,
    # (2) handle from an @mention. The lookbehinds keep "x.com" from matching
    # inside longer host names (e.g. "dropbox.com") and keep the domain part
    # of an email address ("user@company.com") from being read as a mention.
    'twitter': re.compile(
        r'(?<![\w\-])(?:twitter\.com|x\.com)/(\w+)'
        r'|'
        r'(?<![\w.%+\-])@(\w{1,15})(?!\w)'
    ),
}

def extract_contacts(text: str) -> ContactInfo:
    """Extract de-duplicated contact details from free-form text.

    Each pattern in ``PATTERNS`` is applied to *text* independently.

    Args:
        text: Unstructured text to scan.

    Returns:
        A ``ContactInfo`` whose lists are de-duplicated and sorted, so the
        same input always yields the same output. Emails and Twitter handles
        are lower-cased; phones, URLs and LinkedIn paths keep the spelling
        found in the text.
    """
    # The Twitter pattern has two capture groups (profile URL, @mention);
    # exactly one of them is non-empty for any given match.
    handles = (
        from_url or from_mention
        for from_url, from_mention in PATTERNS['twitter'].findall(text)
    )

    # sorted(set) rather than list(set): iterating a set of strings gives a
    # different order from run to run, which makes output (and any later
    # slicing of it) unpredictable.
    return ContactInfo(
        emails=sorted({email.lower() for email in PATTERNS['email'].findall(text)}),
        # Strip surrounding whitespace so the same number is never kept twice.
        phones=sorted({phone.strip() for phone in PATTERNS['phone'].findall(text)}),
        urls=sorted(set(PATTERNS['url'].findall(text))),
        linkedin=sorted(set(PATTERNS['linkedin'].findall(text))),
        twitter=sorted({handle.lower() for handle in handles if handle}),
    )

# Usage: run the extractor on a small sample and print each category.
text = """
Contact John Smith at john.smith@company.com or call (555) 123-4567.
LinkedIn: linkedin.com/in/johnsmith | Twitter: @johnsmith
Website: https://johnsmith.dev
International: +44 7911 123456
"""

contacts = extract_contacts(text)
# Each attribute printed below is a list of strings.
print(f"Emails: {contacts.emails}")
print(f"Phones: {contacts.phones}")
print(f"URLs: {contacts.urls}")
print(f"LinkedIn: {contacts.linkedin}")
# contacts.twitter is also filled in by extract_contacts but is not printed here.

Normalize extracted phone numbers

import phonenumbers

def normalize_phones(raw_phones: list[str], region: str = 'US') -> list[str]:
    """Convert extracted phone strings to E.164 format.

    Args:
        raw_phones: Phone numbers as they appeared in the source text.
        region: Region code passed to ``phonenumbers.parse``; it is used for
            numbers written without an international prefix.

    Returns:
        The distinct E.164 strings of every input that parsed and passed
        ``phonenumbers.is_valid_number``, sorted so the output is
        deterministic. Unparseable or invalid inputs are dropped.
    """
    normalized: set[str] = set()
    for phone in raw_phones:
        try:
            parsed = phonenumbers.parse(phone, region)
        except phonenumbers.NumberParseException:
            # Best-effort by design: regex matches that are not really phone
            # numbers are skipped instead of aborting the whole batch.
            continue
        if phonenumbers.is_valid_number(parsed):
            normalized.add(
                phonenumbers.format_number(
                    parsed,
                    phonenumbers.PhoneNumberFormat.E164,
                )
            )
    # sorted(set) rather than list(set) keeps the order stable between runs.
    return sorted(normalized)

# Three spellings of one number collapse to a single E.164 entry.
# NOTE(review): 555 is a fictional area code, so `is_valid_number` may reject
# these inputs, in which case the actual result is [] — confirm by running.
normalize_phones(['(555) 123-4567', '+1 555-123-4567', '5551234567'])
# ['+15551234567']  — deduplicated

JavaScript contact extraction

// Regexes keyed by contact type. Every pattern carries the `g` flag so that
// `String.prototype.match` returns all occurrences rather than just the first.
const CONTACT_PATTERNS = {
  // local-part@domain.tld with an alphabetic TLD of at least two letters.
  email: /\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b/g,
  // Optional "+1"/"1" prefix and optional 3-digit area code, then 3 + 4 digits.
  // Because the area code is optional, bare 7-digit numbers also match.
  // NOTE(review): `[\s-.]` works as "whitespace, dash or dot" here, but the
  // same class is rejected when the `u` flag is used; `[\s.-]` is unambiguous.
  phone: /(?:\+?1\s*[-.]?\s*)?(?:\(?\d{3}\)?[\s-.]?)?\d{3}[\s-.]?\d{4}/g,
  // http:// or https:// followed by anything up to whitespace or a delimiter.
  url: /https?:\/\/[^\s<>"{}|\\^`\[\]]+/g,
  // LinkedIn profile path: linkedin.com/in/<slug>.
  linkedin: /linkedin\.com\/in\/[\w\-]+/g,
};

/**
 * Run every pattern in CONTACT_PATTERNS against `text`.
 *
 * @param {string} text - Text to scan.
 * @returns {Object} Map from contact type to its de-duplicated matches, in
 *   first-seen order. Types with no match are left out entirely.
 */
function extractContacts(text) {
  return Object.entries(CONTACT_PATTERNS).reduce((found, [kind, regex]) => {
    // With a global regex, match() yields null when nothing is found.
    const hits = text.match(regex) || [];
    const unique = Array.from(new Set(hits));
    if (unique.length > 0) {
      found[kind] = unique;
    }
    return found;
  }, {});
}

Process multiple files / business cards

import json
from pathlib import Path

def process_business_cards(texts: list[str]) -> list[dict]:
    """Build one contact record per business-card text.

    Cards that yield no email, no valid phone number and no URL are left
    out of the result, even if a social handle was found.

    Args:
        texts: Raw text of each business card.

    Returns:
        One dict per retained card, in input order, with the keys ``raw``,
        ``emails``, ``phones``, ``urls`` and ``social``.
    """
    cards = []

    for card_text in texts:
        found = extract_contacts(card_text)
        phones = normalize_phones(found.phones) if found.phones else []
        urls = found.urls[:3]  # Max 3 URLs

        # Guard clause: nothing directly contactable on this card.
        if not (found.emails or phones or urls):
            continue

        cards.append({
            'raw': card_text[:100],  # First 100 chars for reference
            'emails': found.emails,
            'phones': phones,
            'urls': urls,
            'social': {
                'linkedin': found.linkedin,
                'twitter': found.twitter,
            },
        })

    return cards

# Export to CSV:
import csv

def export_to_csv(contacts: list[dict], output_path: str) -> None:
    """Write contact records to a CSV file, one row per record.

    Args:
        contacts: Records shaped like the output of
            ``process_business_cards``: each needs the list-valued keys
            ``emails``, ``phones`` and ``urls`` plus ``social['linkedin']``.
        output_path: Destination file; it is created or overwritten.

    Multiple values for one column are joined with ``'; '``.
    """
    # Explicit UTF-8: without it the platform's default encoding is used,
    # which makes the file non-portable and can raise UnicodeEncodeError on
    # non-ASCII URLs. newline='' lets the csv module control line endings.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['email', 'phone', 'url', 'linkedin'])
        writer.writeheader()
        writer.writerows(
            {
                'email': '; '.join(contact['emails']),
                'phone': '; '.join(contact['phones']),
                'url': '; '.join(contact['urls']),
                'linkedin': '; '.join(contact['social']['linkedin']),
            }
            for contact in contacts
        )

Related posts

Related tool

Email & URL Extractor

Extract every email address and URL from a block of text. Regex-based, case-insensitive, deduplicated, sorted output.

Written by Mian Ali Khalid. Part of the Dev Productivity pillar.