CSV Data Validation — Schema Validation, Type Checking, and Error Reporting
Validate CSV files before importing them into a database or processing pipeline. Learn how to check required columns, validate data types, enforce email/date/numeric formats, and report errors by row number.
Use the tool
CSV to JSON Converter
Convert CSV files to JSON with proper quoting and escaping.
Validating CSV files before import prevents bad data from entering your database. A good validator checks required columns, data types, email formats, date ranges, and reports errors by row number.
Convert validated CSV to JSON with the CSV to JSON Converter.
Python CSV validator
import csv
import re
from dataclasses import dataclass, field
from typing import Callable, Any
from datetime import datetime
@dataclass
class ValidationError:
    """A single validation failure, tied to a row/column in the CSV."""

    row: int      # 1-based data-row number; 0 is used for file-level errors
    column: str   # column header name ('' for file-level errors)
    value: str    # the offending raw cell text
    message: str  # human-readable description of the problem

    def __str__(self):
        return (
            f"Row {self.row}, column '{self.column}': "
            f"{self.message} (got: '{self.value}')"
        )
@dataclass
class ColumnSchema:
    """Declarative validation rules for one CSV column."""
    # Column header name exactly as it appears in the CSV.
    name: str
    # If True, an empty/whitespace-only cell is an error.
    required: bool = True
    type: str = 'string'  # 'string', 'integer', 'float', 'email', 'date', 'boolean'
    # strptime format used when type == 'date'.
    date_format: str = '%Y-%m-%d'
    # Length bounds; note validate_value applies them to the raw cell text.
    min_length: int = 0
    max_length: int = 10000
    # Optional regex the raw value must match (re.match semantics).
    pattern: str | None = None
    # Optional whitelist of exact allowed raw values.
    allowed_values: list[str] | None = None
# Pragmatic email shape check (local@domain.tld, 2+ letter TLD).
# NOTE(review): intentionally much simpler than full RFC 5322.
EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$')
def validate_value(schema: ColumnSchema, value: str, row: int) -> list[ValidationError]:
    """Validate a single cell value against its column schema.

    Args:
        schema: Rules for this column.
        value: Raw cell text from the CSV.
        row: Row number, used only for error reporting.

    Returns:
        A list of ValidationError objects (empty when the value is valid).
    """
    errors: list[ValidationError] = []
    col = schema.name
    stripped = value.strip()  # hoisted: the original re-stripped in every branch

    # Required check:
    if schema.required and not stripped:
        errors.append(ValidationError(row, col, value, 'Required field is empty'))
        return errors  # No point validating empty required field further
    if not stripped:
        return errors  # Optional and empty — OK

    # Type check:
    if schema.type == 'integer':
        try:
            int(stripped)
        except ValueError:
            errors.append(ValidationError(row, col, value, 'Must be an integer'))
    elif schema.type == 'float':
        try:
            float(stripped)
        except ValueError:
            errors.append(ValidationError(row, col, value, 'Must be a number'))
    elif schema.type == 'email':
        if not EMAIL_PATTERN.match(stripped):
            errors.append(ValidationError(row, col, value, 'Invalid email format'))
    elif schema.type == 'date':
        try:
            datetime.strptime(stripped, schema.date_format)
        except ValueError:
            errors.append(ValidationError(row, col, value,
                f'Invalid date format (expected {schema.date_format})'))
    elif schema.type == 'boolean':
        if stripped.lower() not in ('true', 'false', '1', '0', 'yes', 'no'):
            errors.append(ValidationError(row, col, value, 'Must be true/false/1/0/yes/no'))

    # NOTE(review): length, pattern, and allowed-values checks all use the raw
    # (untrimmed) value, unlike the type checks above — confirm this is intended.
    if schema.min_length and len(value) < schema.min_length:
        errors.append(ValidationError(row, col, value,
            f'Too short (min {schema.min_length} chars)'))
    if len(value) > schema.max_length:
        errors.append(ValidationError(row, col, value,
            f'Too long (max {schema.max_length} chars)'))

    # Pattern check (re.match anchors at the start only, not the end):
    if schema.pattern and not re.match(schema.pattern, value):
        # Fixed: was f'Does not match pattern' — an f-string with no placeholders.
        errors.append(ValidationError(row, col, value, 'Does not match pattern'))

    # Allowed values:
    if schema.allowed_values and value not in schema.allowed_values:
        errors.append(ValidationError(row, col, value,
            f'Must be one of: {", ".join(schema.allowed_values)}'))
    return errors
def validate_csv(file_path: str, schema: list[ColumnSchema]) -> list[ValidationError]:
    """Validate a CSV file against a schema. Returns list of errors."""
    found: list[ValidationError] = []
    by_name = {col.name: col for col in schema}
    must_have = {col.name for col in schema if col.required}

    with open(file_path, newline='', encoding='utf-8') as handle:
        reader = csv.DictReader(handle)

        # A file with no header row cannot be validated at all.
        if reader.fieldnames is None:
            found.append(ValidationError(0, '', '', 'File is empty or missing header'))
            return found

        # Bail out early if any required column is absent from the header.
        absent = must_have - set(reader.fieldnames)
        if absent:
            found.append(ValidationError(
                0, '', '', f'Missing required columns: {", ".join(sorted(absent))}'
            ))
            return found

        # Header occupies line 1, so the first data row is line 2.
        for line_no, record in enumerate(reader, start=2):
            for name, col_schema in by_name.items():
                cell = record.get(name, '')
                found.extend(validate_value(col_schema, cell or '', line_no))
    return found
Define and run a schema
# Define schema for a user import CSV:
# (name/email/role are mandatory; age, join_date, and active may be blank)
USER_SCHEMA = [
    ColumnSchema('name', required=True, min_length=2, max_length=100),
    ColumnSchema('email', required=True, type='email'),
    ColumnSchema('age', required=False, type='integer'),
    ColumnSchema('join_date', required=False, type='date', date_format='%Y-%m-%d'),
    ColumnSchema('role', required=True, allowed_values=['admin', 'editor', 'viewer']),
    ColumnSchema('active', required=False, type='boolean'),
]
# Run the validator and print a capped list of problems.
errors = validate_csv('users.csv', USER_SCHEMA)
if not errors:
    print("✅ CSV is valid — ready to import")
else:
    print(f"❌ {len(errors)} validation errors found:")
    for err in errors[:20]:  # Show first 20 errors
        print(f" {err}")
pandas-based validation
import pandas as pd
import re
def validate_users_df(df: pd.DataFrame) -> list[str]:
    """Validate a users DataFrame; return human-readable error strings.

    Checks required columns, blank names, email format, age range, and
    duplicate emails. The input DataFrame is NOT modified.

    Args:
        df: DataFrame expected to have at least 'name' and 'email' columns.

    Returns:
        List of error strings (empty when the frame passes every check).
    """
    errors: list[str] = []

    # Required columns:
    for col in ['name', 'email']:
        if col not in df.columns:
            errors.append(f"Missing required column: {col}")
    if errors:
        return errors  # Without name/email the remaining checks cannot run.

    # Null/blank names:
    null_names = df[df['name'].isnull() | (df['name'].str.strip() == '')].index.tolist()
    if null_names:
        errors.append(f"Rows with empty name: {null_names[:10]}")

    # Email format (na=False treats missing emails as non-matching):
    email_pattern = r'^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$'
    invalid_emails = df[~df['email'].str.match(email_pattern, na=False)].index.tolist()
    if invalid_emails:
        errors.append(f"Rows with invalid email: {invalid_emails[:10]}")

    # Age range (if column exists). Fixed: coerce into a LOCAL Series —
    # the original wrote pd.to_numeric back into df, mutating the caller's data.
    if 'age' in df.columns:
        ages = pd.to_numeric(df['age'], errors='coerce')
        invalid_ages = df[(ages.notna()) & ((ages < 0) | (ages > 150))].index
        if len(invalid_ages):
            errors.append(f"Rows with invalid age: {invalid_ages[:10].tolist()}")

    # Duplicate emails (keep=False flags every occurrence, not just repeats):
    dupes = df[df['email'].duplicated(keep=False)]['email'].unique()
    if len(dupes):
        errors.append(f"Duplicate emails: {list(dupes[:5])}")
    return errors
JavaScript: validate before upload
import Papa from 'papaparse';
// Columns the upload must contain before row-level checks run.
const REQUIRED_COLUMNS = ['name', 'email'];
// Loose email shape check: something@something.something, no whitespace.
const EMAIL_REGEX = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
/**
 * Parse a CSV File with PapaParse and validate it client-side.
 * Resolves with { valid, errors } — never rejects.
 */
function validateCSV(file) {
  return new Promise((resolve) => {
    const problems = [];
    Papa.parse(file, {
      header: true,
      skipEmptyLines: true,
      complete: ({ data, meta }) => {
        // Header check: stop immediately when a required column is absent.
        const absent = REQUIRED_COLUMNS.filter((col) => !meta.fields?.includes(col));
        if (absent.length > 0) {
          problems.push(`Missing columns: ${absent.join(', ')}`);
          resolve({ valid: false, errors: problems });
          return;
        }
        // Row-level checks. Header is line 1, so data starts at line 2.
        for (const [index, row] of data.entries()) {
          const rowNum = index + 2;
          if (!row.name?.trim()) {
            problems.push(`Row ${rowNum}: name is required`);
          }
          if (!row.email?.trim()) {
            problems.push(`Row ${rowNum}: email is required`);
          } else if (!EMAIL_REGEX.test(row.email.trim())) {
            problems.push(`Row ${rowNum}: invalid email "${row.email}"`);
          }
        }
        resolve({ valid: problems.length === 0, errors: problems });
      },
    });
  });
}
Related tools
- CSV to JSON Converter — convert validated CSV to JSON
- JSON Formatter — validate the resulting JSON
- Regex Tester — test validation patterns
Related posts
- CSV Quoting and Escaping Rules (the Real Ones, Not the Folklore) — CSV looks trivial until your spreadsheet has a comma in a name field. Here's the…
- Import CSV to Database — PostgreSQL, MySQL, SQLite, and Node.js — Import CSV files into PostgreSQL, MySQL, and SQLite using COPY commands, LOAD DA…
- CSV Format Guide — Structure, Delimiters, and Common Parsing Issues — CSV (Comma-Separated Values) is a simple tabular text format. Here's the RFC 418…
- CSV with Python pandas — read_csv, to_json, Data Cleaning, and Export — Read, clean, and convert CSV files with Python pandas. Covers pd.read_csv() opti…
Related tool
CSV to JSON Converter
Convert CSV files to JSON with proper quoting and escaping.
Written by Mian Ali Khalid. Part of the Data & Format pillar.