X Xerobit

CSV with Python pandas — read_csv, to_json, Data Cleaning, and Export

Read, clean, and convert CSV files with Python pandas. Covers pd.read_csv() options, handling missing values, type conversion, filtering, groupby aggregations, and exporting to JSON.

Mian Ali Khalid · 5 min read
Use the tool
CSV to JSON Converter
Convert CSV files to JSON with proper quoting and escaping.
Open CSV to JSON Converter →

pandas is the standard Python library for CSV processing. It handles encoding, type inference, missing values, and large files efficiently.

Convert CSV to JSON online with the CSV to JSON Converter.

Read a CSV file

import pandas as pd

# Basic read:
df = pd.read_csv('data.csv')

# Common options (illustrative — mix and match as needed):
df = pd.read_csv(
    'data.csv',
    sep=',',                    # Delimiter (default: ',')
    header=0,                   # Row number for column names (0 = first row)
    index_col=None,             # Column to use as row index
    usecols=['name', 'email'],  # Read only specific columns
    # Use pandas nullable dtypes here: plain int/bool raise ValueError
    # as soon as a column contains a missing value (NaN has no int form),
    # while 'Int64'/'boolean' hold NA natively.
    dtype={'age': 'Int64', 'active': 'boolean'},
    na_values=['N/A', 'n/a', '-', ''],   # Treat as NaN
    encoding='utf-8',           # File encoding
    skiprows=2,                 # Skip first 2 rows
    nrows=1000,                 # Read only 1000 rows
    chunksize=None,             # For large files (see below)
)

# Preview:
df.head(5)      # First 5 rows
df.dtypes       # Column types
df.shape        # (rows, columns)
df.info()       # Non-null counts and types
df.describe()   # Statistics for numeric columns

Handle different CSV formats

# Tab-separated (TSV):
df = pd.read_csv('data.tsv', sep='\t')

# Semicolon-separated (European CSV; decimal=',' parses "3,14" as 3.14):
df = pd.read_csv('data.csv', sep=';', decimal=',')

# No header row — supply the column names explicitly:
df = pd.read_csv('data.csv', header=None, names=['id', 'name', 'email'])

# Multiple header rows (skip metadata); the first kept row becomes the header:
df = pd.read_csv('data.csv', skiprows=3)

# Skip footer rows (skipfooter is only supported by the slower 'python' engine):
df = pd.read_csv('data.csv', skipfooter=2, engine='python')

Data cleaning

# Check missing values:
df.isnull().sum()  # Count NaN per column

# Drop rows with any NaN:
df_clean = df.dropna()

# Drop rows where specific columns are NaN:
df_clean = df.dropna(subset=['email', 'phone'])

# Fill NaN with default values:
df['age'] = df['age'].fillna(df['age'].median())
df['name'] = df['name'].fillna('Unknown')

# Remove duplicates:
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['email'])  # Deduplicate on email column

# Trim whitespace from string columns.
# NOTE: a bare col.str.strip() silently turns every non-string cell in an
# object column (mixed data, numbers) into NaN — strip only real strings
# and pass everything else through untouched.
str_cols = df.select_dtypes(include='object').columns
df[str_cols] = df[str_cols].apply(
    lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
)

# Normalize email:
df['email'] = df['email'].str.lower().str.strip()

# Convert types:
df['age'] = pd.to_numeric(df['age'], errors='coerce')  # NaN for non-numeric
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df['active'] = df['active'].map({'true': True, 'false': False, '1': True, '0': False})

Filter and transform

# Filter rows:
# .eq(True) keeps strict equality semantics (NaN/non-True compare unequal
# and are excluded) without the `== True` construct linters flag (E712).
active_users = df[df['active'].eq(True)]
adults = df[df['age'] >= 18]
has_email = df[df['email'].notna() & (df['email'] != '')]

# Select columns:
contact_info = df[['name', 'email', 'phone']]

# Add computed column:
df['full_name'] = df['first_name'] + ' ' + df['last_name']
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 65, 100],
                         labels=['minor', 'young', 'adult', 'senior'])

# Groupby aggregation (named aggregation keeps output column names readable):
stats = df.groupby('department').agg(
    count=('id', 'count'),
    avg_age=('age', 'mean'),
    emails=('email', list),
).reset_index()

Export to JSON

# Records format (most common for APIs):
df.to_json('output.json', orient='records', indent=2)
# [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]

# Orient options (with no path argument, each returns a JSON string):
df.to_json(orient='records')   # List of row dicts
df.to_json(orient='columns')   # Dict of {column: {index: value}}
df.to_json(orient='index')     # Dict keyed by row index
df.to_json(orient='split')     # columns + data arrays (compact)
df.to_json(orient='table')     # JSON Table Schema format

# Handle dates properly — default is epoch milliseconds; 'iso' gives
# ISO-8601 strings:
df.to_json('output.json', orient='records', date_format='iso', indent=2)

# To Python dict/list (for further processing):
records = df.to_dict('records')
# [{'name': 'Alice', 'age': 30, ...}, ...]

Process large CSV files (chunking)

import pandas as pd
import json

def process_large_csv(csv_path: str, json_path: str, chunk_size: int = 10_000) -> int:
    """Convert a large CSV to a JSON array of records, in chunks.

    Reads ``chunk_size`` rows at a time, drops rows with a missing email,
    normalizes the email column, and streams each record straight to
    ``json_path`` — memory use stays bounded by one chunk instead of the
    whole file (the point of chunking in the first place).

    Returns the number of records written.
    """
    written = 0
    with open(json_path, 'w', encoding='utf-8') as out:
        out.write('[')
        for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
            # Clean each chunk:
            chunk = chunk.dropna(subset=['email'])
            chunk['email'] = chunk['email'].str.lower().str.strip()

            for record in chunk.to_dict('records'):
                # Comma before every record except the first keeps the
                # array valid JSON without buffering anything.
                out.write(',\n' if written else '\n')
                # default=str stringifies values json can't encode
                # (e.g. numpy scalars, timestamps), as before.
                out.write(json.dumps(record, default=str))
                written += 1
        out.write('\n]')

    print(f"Processed {written} records")
    return written

# Usage example — file paths are illustrative:
process_large_csv('large-export.csv', 'output.json')

Merge multiple CSV files

import pandas as pd
from pathlib import Path

# Concatenate all CSV files in a directory.
# pathlib replaces the string-path glob module (which made the Path
# import above dead code); sorted() makes the file order — and hence
# which duplicate survives drop_duplicates — deterministic.
files = sorted(Path('data').glob('*.csv'))
dfs = [pd.read_csv(f) for f in files]
combined = pd.concat(dfs, ignore_index=True)
combined = combined.drop_duplicates(subset=['id'])

combined.to_json('combined.json', orient='records', indent=2)
print(f"Combined {len(files)} files: {len(combined)} records")

Related posts

Related tool

CSV to JSON Converter

Convert CSV files to JSON with proper quoting and escaping.

Written by Mian Ali Khalid. Part of the Data & Format pillar.