Written in Python below is a module, built and well explained, to process CSV files of
any type; just follow the implementation to the end.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO CSV Processor
This module processes CSV files for intent generation.
"""
import os
import logging
import csv
import pandas as pd
from collections import Counter
class CSVProcessor:
    """Processes CSV files for intent generation.

    The processor loads a CSV file with pandas, summarises its structure
    (column names, dtypes, row/column counts, a few sample rows) and guesses
    which columns hold entities such as names, locations, dates or numeric
    quantities.

    Progress can be observed by assigning callables to ``on_progress``
    (called with an integer percentage) and ``on_status`` (called with a
    human-readable message). Both default to no-ops.
    """

    # Substrings looked for (case-insensitively) in a column name to decide
    # which kind of entity the column probably contains.
    _NAME_TERMS = ('name', 'user', 'person', 'customer', 'client')
    _LOCATION_TERMS = ('city', 'state', 'country', 'address', 'location')
    _DATE_TERMS = ('date', 'time', 'day', 'year', 'month')

    # Maximum number of distinct example values kept per entity kind.
    _MAX_EXAMPLES = 10
    # Number of leading rows returned as sample data.
    _SAMPLE_ROWS = 5

    def __init__(self):
        """Initialize the CSV processor with no-op progress/status callbacks."""
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """
        Process a CSV file.

        Args:
            file_path (str): Path to the CSV file

        Returns:
            dict: Processed data with the keys ``columns`` (list of column
            names), ``column_types`` (column name -> dtype string),
            ``num_rows``, ``num_cols``, ``sample`` (first rows as a list of
            record dicts) and ``entities`` (see ``_identify_entities``).

        Raises:
            Exception: Any error raised while reading or analysing the file
            is logged with its traceback and then re-raised unchanged.
        """
        try:
            self.on_status(f"Processing CSV file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Read file with pandas
            df = pd.read_csv(file_path)
            self.on_progress(30)

            # Basic analysis
            self.on_status("Analyzing CSV structure...")
            columns = df.columns.tolist()
            # dtypes are stringified so the result stays plain-Python data.
            column_types = {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()}
            num_rows = len(df)
            num_cols = len(columns)
            self.on_progress(50)

            # Extract sample data
            self.on_status("Extracting sample data...")
            sample = df.head(self._SAMPLE_ROWS).to_dict(orient='records')
            self.on_progress(70)

            # Identify potential entities
            self.on_status("Identifying potential entities...")
            entities = self._identify_entities(df)
            self.on_progress(90)

            # Combine results
            result = {
                'columns': columns,
                'column_types': column_types,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'sample': sample,
                'entities': entities,
            }
            self.on_progress(100)
            self.on_status("CSV processing complete")
            return result
        except Exception as e:
            # Log at the boundary, then let the caller decide how to recover.
            logging.error(f"Error processing CSV file: {str(e)}", exc_info=True)
            raise

    @staticmethod
    def _is_text_column(series):
        """Return True when *series* holds text (object or string dtype)."""
        # Older pandas stores text as ``object``; newer versions may use a
        # dedicated string dtype, so both are accepted.
        return bool(
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )

    def _identify_entities(self, df):
        """
        Identify potential entities in the CSV data.

        Each column is classified by the first rule that matches its
        lower-cased name: name-like, location-like, date-like, and finally
        "numeric quantity" for int64/float64 columns that matched no keyword.
        A column whose name matches a keyword is never counted as a quantity,
        even when it is numeric.

        Args:
            df (pandas.DataFrame): DataFrame to analyze

        Returns:
            dict: Dictionary of potential entities. Possible keys:
            ``names`` / ``locations`` (up to 10 distinct non-null example
            values; when several columns qualify the last one wins),
            ``dates`` (``True`` if any date-like column exists) and
            ``quantities`` (list of numeric column names).
        """
        entities = {}

        for col in df.columns:
            # str() guards against non-string column labels (e.g. integers).
            col_lower = str(col).lower()
            series = df[col]

            if any(term in col_lower for term in self._NAME_TERMS):
                # Only text columns can provide example names.
                if self._is_text_column(series):
                    entities['names'] = (
                        series.dropna().unique().tolist()[:self._MAX_EXAMPLES]
                    )
            elif any(term in col_lower for term in self._LOCATION_TERMS):
                if self._is_text_column(series):
                    entities['locations'] = (
                        series.dropna().unique().tolist()[:self._MAX_EXAMPLES]
                    )
            elif any(term in col_lower for term in self._DATE_TERMS):
                entities['dates'] = True
            elif series.dtype in ['int64', 'float64']:
                # Numeric columns that might be quantities.
                entities.setdefault('quantities', []).append(col)

        return entities
This module processes CSV files for intent generation, in case you are building an
intent-based assistant or chatbot.