X Xerobit

CSV Data Validation — Schema Validation, Type Checking, and Error Reporting

Validate CSV files before importing them into a database or processing pipeline. Learn how to check required columns, validate data types, enforce email/date/numeric formats, and report errors by row number.

Mian Ali Khalid · 5 min read
Use the tool
CSV to JSON Converter
Convert CSV files to JSON with proper quoting and escaping.
Open CSV to JSON Converter →

Validating CSV files before import prevents bad data from entering your database. A good validator checks required columns, data types, email formats, date ranges, and reports errors by row number.

Convert validated CSV to JSON with the CSV to JSON Converter.

Python CSV validator

import csv
import re
from dataclasses import dataclass, field
from typing import Callable, Any
from datetime import datetime

@dataclass
class ValidationError:
    """A single validation failure tied to one cell (or the whole file)."""

    row: int      # row number in the file; 0 marks a file-level error
    column: str   # column header the failure belongs to ('' for file-level)
    value: str    # the raw offending cell text
    message: str  # human-readable description of what is wrong

    def __str__(self):
        """Render the failure as a one-line report entry."""
        location = f"Row {self.row}, column '{self.column}'"
        return f"{location}: {self.message} (got: '{self.value}')"

@dataclass
class ColumnSchema:
    """Declarative validation rules for a single CSV column."""

    name: str                           # column header as it appears in the CSV
    required: bool = True               # if True, the cell must be non-empty
    type: str = 'string'       # 'string', 'integer', 'float', 'email', 'date', 'boolean'
    date_format: str = '%Y-%m-%d'       # strptime format used when type == 'date'
    min_length: int = 0                 # minimum cell length; 0 disables the check
    max_length: int = 10000             # maximum cell length
    pattern: str | None = None          # optional regex the cell must match
    allowed_values: list | None = None  # optional whitelist of exact cell values

EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$')

def validate_value(schema: ColumnSchema, value: str, row: int) -> list[ValidationError]:
    """Validate a single cell against its column schema.

    Type checks run on the whitespace-stripped value; length, pattern, and
    allowed-value checks run on the raw value (as the original code did), so
    stray padding is surfaced rather than silently accepted.

    Args:
        schema: Rules for this column.
        value: Raw cell text from the CSV.
        row: Row number used in error reports.

    Returns:
        A list of ValidationError objects; empty when the cell is valid.
    """
    errors = []
    col = schema.name
    stripped = value.strip()  # hoisted: was recomputed in every branch

    # Required check — an empty required field short-circuits everything else:
    if schema.required and not stripped:
        errors.append(ValidationError(row, col, value, 'Required field is empty'))
        return errors

    if not stripped:
        return errors  # Optional and empty — OK

    # Type check (only the declared type's branch runs):
    if schema.type == 'integer':
        try:
            int(stripped)
        except ValueError:
            errors.append(ValidationError(row, col, value, 'Must be an integer'))
    elif schema.type == 'float':
        try:
            float(stripped)
        except ValueError:
            errors.append(ValidationError(row, col, value, 'Must be a number'))
    elif schema.type == 'email':
        if not EMAIL_PATTERN.match(stripped):
            errors.append(ValidationError(row, col, value, 'Invalid email format'))
    elif schema.type == 'date':
        try:
            datetime.strptime(stripped, schema.date_format)
        except ValueError:
            errors.append(ValidationError(row, col, value,
                f'Invalid date format (expected {schema.date_format})'))
    elif schema.type == 'boolean':
        if stripped.lower() not in ('true', 'false', '1', '0', 'yes', 'no'):
            errors.append(ValidationError(row, col, value, 'Must be true/false/1/0/yes/no'))

    # Length checks (min_length == 0 disables the minimum check):
    if schema.min_length and len(value) < schema.min_length:
        errors.append(ValidationError(row, col, value,
            f'Too short (min {schema.min_length} chars)'))
    if len(value) > schema.max_length:
        errors.append(ValidationError(row, col, value,
            f'Too long (max {schema.max_length} chars)'))

    # Pattern check — bug fix: the original f-string had no placeholder, so the
    # report never said WHICH pattern failed. Include it to make errors actionable.
    if schema.pattern and not re.match(schema.pattern, value):
        errors.append(ValidationError(row, col, value,
            f'Does not match pattern: {schema.pattern}'))

    # Allowed values (exact match against the raw cell):
    if schema.allowed_values and value not in schema.allowed_values:
        errors.append(ValidationError(row, col, value,
            f'Must be one of: {", ".join(schema.allowed_values)}'))

    return errors


def validate_csv(file_path: str, schema: list[ColumnSchema]) -> list[ValidationError]:
    """Validate a CSV file against a schema. Returns list of errors."""
    problems: list[ValidationError] = []
    by_name = {col.name: col for col in schema}
    required = {col.name for col in schema if col.required}

    with open(file_path, newline='', encoding='utf-8') as fh:
        reader = csv.DictReader(fh)

        # A file with no header at all gets a single file-level error:
        header = reader.fieldnames
        if header is None:
            problems.append(ValidationError(0, '', '', 'File is empty or missing header'))
            return problems

        # All required columns must be present before any rows are checked:
        missing = required - set(header)
        if missing:
            problems.append(ValidationError(
                0, '', '', f'Missing required columns: {", ".join(sorted(missing))}'
            ))
            return problems

        # Row 1 is the header, so data rows are numbered from 2:
        for row_num, record in enumerate(reader, start=2):
            for name, col_schema in by_name.items():
                cell = record.get(name) or ''  # missing/None cells validate as empty
                problems.extend(validate_value(col_schema, cell, row_num))

    return problems

Define and run a schema

# Schema for a user-import CSV: column names, types, and constraints.
USER_SCHEMA = [
    ColumnSchema('name', required=True, min_length=2, max_length=100),
    ColumnSchema('email', required=True, type='email'),
    ColumnSchema('age', required=False, type='integer'),
    ColumnSchema('join_date', required=False, type='date', date_format='%Y-%m-%d'),
    ColumnSchema('role', required=True, allowed_values=['admin', 'editor', 'viewer']),
    ColumnSchema('active', required=False, type='boolean'),
]

errors = validate_csv('users.csv', USER_SCHEMA)

if errors:
    print(f"❌ {len(errors)} validation errors found:")
    for err in errors[:20]:  # cap the report at the first 20 problems
        print(f"  {err}")
else:
    print("✅ CSV is valid — ready to import")

pandas-based validation

import pandas as pd
import re

def validate_users_df(df: pd.DataFrame) -> list[str]:
    """Validate a user DataFrame; return a list of error-summary strings.

    Bug fix: the input frame is no longer modified — the original version
    overwrote the caller's 'age' column with coerced numerics as a side
    effect of validation.

    Args:
        df: DataFrame with at least 'name' and 'email' columns; 'age' is
            checked only if present.

    Returns:
        A list of human-readable error summaries; empty when the frame is valid.
    """
    errors = []

    # Required columns — bail out early, later checks assume they exist:
    for col in ['name', 'email']:
        if col not in df.columns:
            errors.append(f"Missing required column: {col}")
    if errors:
        return errors

    # Null or blank names:
    null_names = df[df['name'].isnull() | (df['name'].str.strip() == '')].index.tolist()
    if null_names:
        errors.append(f"Rows with empty name: {null_names[:10]}")

    # Email format (na=False so NaN emails count as invalid, not errors):
    email_pattern = r'^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$'
    invalid_emails = df[~df['email'].str.match(email_pattern, na=False)].index.tolist()
    if invalid_emails:
        errors.append(f"Rows with invalid email: {invalid_emails[:10]}")

    # Age range (if column exists) — coerce into a LOCAL series instead of
    # writing back into the caller's frame:
    if 'age' in df.columns:
        ages = pd.to_numeric(df['age'], errors='coerce')
        invalid_ages = df[(ages.notna()) & ((ages < 0) | (ages > 150))].index
        if len(invalid_ages):
            errors.append(f"Rows with invalid age: {invalid_ages[:10].tolist()}")

    # Duplicate emails (keep=False flags every member of a duplicate group):
    dupes = df[df['email'].duplicated(keep=False)]['email'].unique()
    if len(dupes):
        errors.append(f"Duplicate emails: {list(dupes[:5])}")

    return errors

JavaScript: validate before upload

import Papa from 'papaparse';

const REQUIRED_COLUMNS = ['name', 'email'];
const EMAIL_REGEX = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;

/**
 * Parse and validate a CSV file in the browser before upload.
 *
 * Always resolves with `{ valid, errors }` — it never rejects, so callers
 * can destructure the result unconditionally.
 *
 * @param {File} file - CSV file selected by the user.
 * @returns {Promise<{valid: boolean, errors: string[]}>}
 */
function validateCSV(file) {
  return new Promise((resolve) => {
    const errors = [];

    Papa.parse(file, {
      header: true,
      skipEmptyLines: true,
      // Bug fix: without an error handler, the promise never settled when
      // the file could not be read or parsed.
      error: (err) => {
        errors.push(`Parse error: ${err.message}`);
        resolve({ valid: false, errors });
      },
      complete: ({ data, meta }) => {
        // Check required columns:
        const missing = REQUIRED_COLUMNS.filter(c => !meta.fields?.includes(c));
        if (missing.length) {
          errors.push(`Missing columns: ${missing.join(', ')}`);
          return resolve({ valid: false, errors });
        }

        // Validate each row (row 1 is the header, so data starts at 2):
        data.forEach((row, i) => {
          const rowNum = i + 2; // 2 = first data row

          if (!row.name?.trim()) {
            errors.push(`Row ${rowNum}: name is required`);
          }
          if (!row.email?.trim()) {
            errors.push(`Row ${rowNum}: email is required`);
          } else if (!EMAIL_REGEX.test(row.email.trim())) {
            errors.push(`Row ${rowNum}: invalid email "${row.email}"`);
          }
        });

        resolve({ valid: errors.length === 0, errors });
      },
    });
  });
}

Related posts

Related tool

CSV to JSON Converter

Convert CSV files to JSON with proper quoting and escaping.

Written by Mian Ali Khalid. Part of the Data & Format pillar.