X Xerobit

CSV with Python pandas — read_csv, to_json, Data Cleaning, and Export

Read, clean, and convert CSV files with Python pandas. Covers pd.read_csv() options, handling missing values, type conversion, filtering, groupby aggregations, and exporting to JSON.

Mian Ali Khalid · 5 min read
Use the tool
CSV to JSON Converter
Convert CSV files to JSON with proper quoting and escaping.
Open CSV to JSON Converter →

pandas is the standard Python library for CSV processing. It handles encoding, type inference, missing values, and large files efficiently.

Convert CSV to JSON online with the CSV to JSON Converter.

Read a CSV file

import pandas as pd

# Basic read:
df = pd.read_csv('data.csv')

# Common options (illustrative — mix and match as needed):
df = pd.read_csv(
    'data.csv',
    sep=',',                    # Delimiter (default: ',')
    header=0,                   # Row number for column names (0 = first row)
    index_col=None,             # Column to use as row index
    usecols=['name', 'email'],  # Read only specific columns
    # Use pandas nullable dtypes here: plain int/bool raise ValueError
    # as soon as a column contains a missing value (NaN has no int form),
    # while 'Int64'/'boolean' hold NA natively.
    dtype={'age': 'Int64', 'active': 'boolean'},
    na_values=['N/A', 'n/a', '-', ''],   # Treat as NaN
    encoding='utf-8',           # File encoding
    skiprows=2,                 # Skip first 2 rows
    nrows=1000,                 # Read only 1000 rows
    chunksize=None,             # For large files (see below)
)

# Preview:
df.head(5)      # First 5 rows
df.dtypes       # Column types
df.shape        # (rows, columns)
df.info()       # Non-null counts and types
df.describe()   # Statistics for numeric columns

Handle different CSV formats

# Tab-separated (TSV):
df = pd.read_csv('data.tsv', sep='\t')

# Semicolon-separated (European CSV; decimal=',' parses "3,14" as 3.14):
df = pd.read_csv('data.csv', sep=';', decimal=',')

# No header row — supply the column names explicitly:
df = pd.read_csv('data.csv', header=None, names=['id', 'name', 'email'])

# Multiple header rows (skip metadata); the first kept row becomes the header:
df = pd.read_csv('data.csv', skiprows=3)

# Skip footer rows (skipfooter is only supported by the slower 'python' engine):
df = pd.read_csv('data.csv', skipfooter=2, engine='python')

Data cleaning

# Check missing values:
df.isnull().sum()  # Count NaN per column

# Drop rows with any NaN:
df_clean = df.dropna()

# Drop rows where specific columns are NaN:
df_clean = df.dropna(subset=['email', 'phone'])

# Fill NaN with default values:
df['age'] = df['age'].fillna(df['age'].median())
df['name'] = df['name'].fillna('Unknown')

# Remove duplicates:
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['email'])  # Deduplicate on email column

# Trim whitespace from string columns.
# NOTE: a bare col.str.strip() silently turns every non-string cell in an
# object column (mixed data, numbers) into NaN — strip only real strings
# and pass everything else through untouched.
str_cols = df.select_dtypes(include='object').columns
df[str_cols] = df[str_cols].apply(
    lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
)

# Normalize email:
df['email'] = df['email'].str.lower().str.strip()

# Convert types:
df['age'] = pd.to_numeric(df['age'], errors='coerce')  # NaN for non-numeric
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df['active'] = df['active'].map({'true': True, 'false': False, '1': True, '0': False})

Filter and transform

# Filter rows:
# .eq(True) keeps strict equality semantics (NaN/non-True compare unequal
# and are excluded) without the `== True` construct linters flag (E712).
active_users = df[df['active'].eq(True)]
adults = df[df['age'] >= 18]
has_email = df[df['email'].notna() & (df['email'] != '')]

# Select columns:
contact_info = df[['name', 'email', 'phone']]

# Add computed column:
df['full_name'] = df['first_name'] + ' ' + df['last_name']
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 65, 100],
                         labels=['minor', 'young', 'adult', 'senior'])

# Groupby aggregation (named aggregation keeps output column names readable):
stats = df.groupby('department').agg(
    count=('id', 'count'),
    avg_age=('age', 'mean'),
    emails=('email', list),
).reset_index()

Export to JSON

# Records format (most common for APIs):
df.to_json('output.json', orient='records', indent=2)
# [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]

# Orient options (with no path argument, each returns a JSON string):
df.to_json(orient='records')   # List of row dicts
df.to_json(orient='columns')   # Dict of {column: {index: value}}
df.to_json(orient='index')     # Dict keyed by row index
df.to_json(orient='split')     # columns + data arrays (compact)
df.to_json(orient='table')     # JSON Table Schema format

# Handle dates properly — default is epoch milliseconds; 'iso' gives
# ISO-8601 strings:
df.to_json('output.json', orient='records', date_format='iso', indent=2)

# To Python dict/list (for further processing):
records = df.to_dict('records')
# [{'name': 'Alice', 'age': 30, ...}, ...]

Process large CSV files (chunking)

import pandas as pd
import json

def process_large_csv(csv_path: str, json_path: str, chunk_size: int = 10_000) -> int:
    """Convert a large CSV to a JSON array of records, in chunks.

    Reads ``chunk_size`` rows at a time, drops rows with a missing email,
    normalizes the email column, and streams each record straight to
    ``json_path`` — memory use stays bounded by one chunk instead of the
    whole file (the point of chunking in the first place).

    Returns the number of records written.
    """
    written = 0
    with open(json_path, 'w', encoding='utf-8') as out:
        out.write('[')
        for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
            # Clean each chunk:
            chunk = chunk.dropna(subset=['email'])
            chunk['email'] = chunk['email'].str.lower().str.strip()

            for record in chunk.to_dict('records'):
                # Comma before every record except the first keeps the
                # array valid JSON without buffering anything.
                out.write(',\n' if written else '\n')
                # default=str stringifies values json can't encode
                # (e.g. numpy scalars, timestamps), as before.
                out.write(json.dumps(record, default=str))
                written += 1
        out.write('\n]')

    print(f"Processed {written} records")
    return written

# Usage example — file paths are illustrative:
process_large_csv('large-export.csv', 'output.json')

Merge multiple CSV files

import pandas as pd
from pathlib import Path

# Concatenate all CSV files in a directory.
# pathlib replaces the string-path glob module (which made the Path
# import above dead code); sorted() makes the file order — and hence
# which duplicate survives drop_duplicates — deterministic.
files = sorted(Path('data').glob('*.csv'))
dfs = [pd.read_csv(f) for f in files]
combined = pd.concat(dfs, ignore_index=True)
combined = combined.drop_duplicates(subset=['id'])

combined.to_json('combined.json', orient='records', indent=2)
print(f"Combined {len(files)} files: {len(combined)} records")

Related posts

Related tool

CSV to JSON Converter

Convert CSV files to JSON with proper quoting and escaping.

Written by Mian Ali Khalid. Part of the Data & Format pillar.