0% found this document useful (0 votes)

49 views9 pages

Text2SQL Agent for CSV Databases

Uploaded by

Avinash Reddy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

49 views9 pages

Text2SQL Agent for CSV Databases

Uploaded by

Avinash Reddy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 9

14_text2sql_agent

October 8, 2024

1 Text2SQL Agent to Interact with CSV Data

1.1 System Architecture
Think of it as an agent with a set of tools: search_cache(), generate_sql_query(),
and run_sql_query().

1.2 Data Ingestion Pipeline

1. Read CSV
2. Create Database schema
3. Create a table
4. Load table with CSV data

[1]: import pandas as pd

import sqlite3

def csv_to_sqlite(csv_file, db_name, table_name):
    """
    Loads a CSV file into a table of a SQLite database.

    The CSV is read into a pandas DataFrame, a table is created from the
    DataFrame's column names and dtypes, and the rows are then written with
    ``DataFrame.to_sql``.

    Args:
        csv_file (str): Path of the CSV file to read.
        db_name (str): The SQLite database file (created if it doesn't exist).
        table_name (str): The name of the table to create and fill.
    """
    def quote_identifier(name):
        # Double any embedded quote so the name is a valid SQLite identifier.
        return '"' + str(name).replace('"', '""') + '"'

    def sqlite_type(dtype):
        # Map a pandas dtype onto the SQLite column type used in the schema.
        if dtype == 'int64':
            return 'INTEGER'
        if dtype == 'float64':
            return 'REAL'
        return 'TEXT'

    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)

    # Connect to the SQLite database (it will create the database file if it
    # doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        # Infer the schema based on the DataFrame columns and data types
        col_definitions = ", ".join(
            f'{quote_identifier(col)} {sqlite_type(df[col].dtype)}'
            for col in df.columns
        )
        create_table_query = (
            f'CREATE TABLE IF NOT EXISTS {quote_identifier(table_name)} '
            f'({col_definitions});'
        )
        conn.execute(create_table_query)
        print(f"Table '{table_name}' created with schema: {col_definitions}")

        # Insert CSV data into the SQLite table. With if_exists='replace'
        # pandas drops and recreates the table, so re-running this function
        # replaces earlier contents instead of appending to them.
        df.to_sql(table_name, conn, if_exists='replace', index=False)

        conn.commit()
    finally:
        # Close the connection even when table creation or the insert fails.
        conn.close()

    print(f"Data loaded into '{table_name}' table in '{db_name}' SQLite database.")

# Build the demo database from movies.csv.
# db_name and table_name stay defined at module level and are read by later cells.
csv_file = "movies.csv"
db_name = "movies_db.db"
table_name = "movies"
csv_to_sqlite(csv_file, db_name, table_name)

Table 'movies' created with schema: "Movie" TEXT, "LeadStudio" TEXT,

"RottenTomatoes" REAL, "AudienceScore" REAL, "Story" TEXT, "Genre" TEXT,
"TheatersOpenWeek" REAL, "OpeningWeekend" REAL, "BOAvgOpenWeekend" REAL,
"DomesticGross" REAL, "ForeignGross" REAL, "WorldGross" REAL, "Budget" REAL,
"Profitability" REAL, "OpenProfit" REAL, "Year" INTEGER
Data loaded into 'movies' table in 'movies_db.db' SQLite database.

def run_sql_query(db_name, query):
    """
    Executes a SQL query on a SQLite database and returns the results.

    Args:
        db_name (str): The name of the SQLite database file.
        query (str): The SQL query to run.

    Returns:
        list: Query result as a list of tuples, or an empty list if no results
        or error occurred.
    """
    # NOTE(review): the query text is executed as given; callers in this file
    # pass LLM-generated SQL, so confirm whether statements should be
    # restricted to read-only ones.
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_name)
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            # fetchall() already yields an empty list when nothing matched
            return cursor.fetchall()
        finally:
            # Close the connection on success and on failure alike
            conn.close()
    except sqlite3.Error as e:
        print(f"An error occurred while executing the query: {e}")
        return []

# Sanity check: count the rows that were loaded into the table.
query = f"SELECT count(*) FROM {table_name};"

results = run_sql_query(db_name, query)

# run_sql_query returns an empty list on error, so only print real rows.
if results:
    for row in results:
        print(row)

(970,)

1.3 Ask Natural Language Questions

[24]: import openai
import faiss
import numpy as np
import os
from openai import OpenAI
from litellm import completion
from IPython.display import Markdown, display

# The API key is read from the environment; it is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize the FAISS index
dimension = 1536  # Dimension size for OpenAI embeddings (may vary by model)
index = faiss.IndexFlatL2(dimension)  # L2 distance index

# Cache will hold (user_question, sql_query, response).
# Entries are looked up by the position FAISS returns, so the n-th vector
# added to `index` must correspond to cache[n].
cache = []

# Helper function to get embeddings from OpenAI or any embedding model
def get_embeddings(text):
    """
    Converts a text string into a vector embedding using OpenAI embeddings.

    Args:
        text (str): The text string to convert.

    Returns:
        np.array: A float32 vector representation of the text.
    """
    response = client.embeddings.create(
        input=text, model="text-embedding-3-small"
    )
    # FAISS stores and searches float32 vectors, so build the array in that
    # dtype instead of numpy's default float64.
    return np.array(response.data[0].embedding, dtype=np.float32)

4
def search_cache(question_embedding, threshold=0.1):
    """
    Searches the FAISS index for a similar question.

    Args:
        question_embedding (np.array): The embedding of the user's question.
        threshold (float): Upper bound on the distance reported by the L2
            index; the nearest cached question counts as a hit only when its
            distance is below this value (smaller means more similar).

    Returns:
        tuple: (sql_query, response) if a hit is found, otherwise None.
    """
    if index.ntotal == 0:
        return None

    # FAISS expects a 2-D float32 batch of query vectors.
    query_vector = np.asarray(question_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_vector, k=1)

    nearest = int(indices[0][0])
    # Guard the lookup: the position must refer to an existing cache entry.
    if distances[0][0] < threshold and 0 <= nearest < len(cache):
        _, sql_query, response = cache[nearest]
        return sql_query, response
    return None

def get_table_schema(db_name, table_name):
    """
    Retrieves the schema (columns and data types) for a given table in the
    SQLite database.

    Args:
        db_name (str): The name of the SQLite database file.
        table_name (str): The name of the table.

    Returns:
        list: One tuple per column as returned by ``PRAGMA table_info``
        (position, column name, data type, and other info). Empty when the
        table does not exist.
    """
    # Quote the name so tables containing spaces or quotes are handled.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    conn = sqlite3.connect(db_name)
    try:
        # Use PRAGMA to get the table schema
        return conn.execute(f"PRAGMA table_info({quoted_table});").fetchall()
    finally:
        # Close the connection even if the PRAGMA fails
        conn.close()

# Show the schema of the movies table, one column tuple per line.
table_name = 'movies'
schema = get_table_schema(db_name, table_name)
print(f"Schema for {table_name}:")
for col in schema:
    print(col)

Schema for movies:

(0, 'Movie', 'TEXT', 0, None, 0)
(1, 'LeadStudio', 'TEXT', 0, None, 0)
(2, 'RottenTomatoes', 'REAL', 0, None, 0)
(3, 'AudienceScore', 'REAL', 0, None, 0)
(4, 'Story', 'TEXT', 0, None, 0)
(5, 'Genre', 'TEXT', 0, None, 0)
(6, 'TheatersOpenWeek', 'REAL', 0, None, 0)
(7, 'OpeningWeekend', 'REAL', 0, None, 0)
(8, 'BOAvgOpenWeekend', 'REAL', 0, None, 0)
(9, 'DomesticGross', 'REAL', 0, None, 0)
(10, 'ForeignGross', 'REAL', 0, None, 0)
(11, 'WorldGross', 'REAL', 0, None, 0)
(12, 'Budget', 'REAL', 0, None, 0)
(13, 'Profitability', 'REAL', 0, None, 0)
(14, 'OpenProfit', 'REAL', 0, None, 0)
(15, 'Year', 'INTEGER', 0, None, 0)

def generate_llm_prompt(table_name, table_schema):
    """
    Generates a prompt to provide context about a table's schema for LLM to
    convert natural language to SQL.

    Args:
        table_name (str): The name of the table.
        table_schema (list): A list of tuples where each tuple contains
            information about the columns in the table; index 1 is read as
            the column name and index 2 as the column type.

    Returns:
        str: The generated prompt to be used by the LLM.
    """
    intro = (
        "You are an expert in writing SQL queries for relational databases.\n"
        "You will be provided with a database schema and a natural\n"
        "language question, and your task is to generate an accurate SQL query.\n"
        "\n"
        f"The database has a table named '{table_name}' with the following schema:\n\n"
    )

    # One "- name (TYPE)" bullet per column
    column_lines = [f"- {col[1]} ({col[2]})\n" for col in table_schema]

    closing = (
        "\nPlease generate a SQL query based on the following natural "
        "language question. ONLY return the SQL query."
    )

    # Assemble the pieces once instead of growing the string with +=
    return "".join([intro, "Columns:\n", *column_lines, closing])

# Preview the prompt that is built from the movies table schema.
table_name = "movies"
schema = get_table_schema(db_name, table_name)
# Generate the prompt
llm_prompt = generate_llm_prompt(table_name, schema)
print(llm_prompt)

You are an expert in writing SQL queries for relational databases.

You will be provided with a database schema and a natural
language question, and your task is to generate an accurate SQL query.

The database has a table named 'movies' with the following schema:

Columns:
- Movie (TEXT)
- LeadStudio (TEXT)
- RottenTomatoes (REAL)
- AudienceScore (REAL)
- Story (TEXT)
- Genre (TEXT)
- TheatersOpenWeek (REAL)
- OpeningWeekend (REAL)
- BOAvgOpenWeekend (REAL)
- DomesticGross (REAL)
- ForeignGross (REAL)
- WorldGross (REAL)
- Budget (REAL)
- Profitability (REAL)
- OpenProfit (REAL)
- Year (INTEGER)

Please generate a SQL query based on the following natural language question.
ONLY return the SQL query.

def handle_user_question(user_question):
    """
    Handles the user's question by first searching the cache, and if there's no
    hit, generating a SQL query and response.

    Args:
        user_question (str): The user's natural language question.

    Returns:
        list: The response to the user's question.
    """
    # Convert the user's question to an embedding
    question_embedding = get_embeddings(user_question)

    # Step 1: Search cache for similar questions
    cache_hit = search_cache(question_embedding)
    if cache_hit is not None:
        sql_query, response = cache_hit
        print(f"Cache hit! SQL Query: {sql_query}")
        return response

    # Step 2: No hit, go to LLM for SQL generation
    print("Cache miss! Generating SQL from LLM...")
    sql_query = generate_sql_query(user_question)

    # Step 3: Run the SQL query on the database
    response = run_sql_query(db_name, sql_query)

    # Step 4: Store question, SQL, and response in cache.
    # The embedding goes into the FAISS index first: cache entries are looked
    # up by index position, so a failed add must not leave an extra cache row.
    # NOTE(review): run_sql_query returns [] for failed queries too, so a bad
    # generated query is cached like a genuine empty result - confirm intent.
    vector = np.asarray(question_embedding, dtype=np.float32).reshape(1, -1)
    index.add(vector)  # Add question embedding to FAISS index
    cache.append((user_question, sql_query, response))

    return response

def generate_sql_query(question, table_name='movies', db_name='movies_db.db'):
    """
    Asks the LLM to translate a natural language question into a SQL query.

    Args:
        question (str): The user's natural language question.
        table_name (str): The table whose schema is given to the LLM.
        db_name (str): The SQLite database file that holds the table.

    Returns:
        str: The SQL text from the LLM answer, with Markdown code fences and
        surrounding whitespace removed.
    """
    table_schema = get_table_schema(db_name, table_name)

    # generate_llm_prompt already interpolates the table name; calling
    # .format() on its result again would fail on any literal brace in it.
    llm_prompt = generate_llm_prompt(table_name, table_schema)
    user_prompt = f"Question: {question}"

    response = completion(
        api_key=OPENAI_API_KEY,
        model="gpt-4o-mini",
        messages=[
            {"content": llm_prompt, "role": "system"},
            {"content": user_prompt, "role": "user"},
        ],
        max_tokens=1000,
    )
    answer = response.choices[0].message.content
    display(Markdown(answer))

    # Drop the Markdown fence markers around the SQL
    query = answer.replace("```sql", "").replace("```", "")
    return query.strip()

8
# Example questions; uncomment one of the alternatives to try it.
# question = "total number of movies are made by Warner Bros company in year 2008?"
# question = "how many movies have RottenTomatoes scores lower than 85?"
question = "how many movies with action genre are in the database"
handle_user_question(question)

Cache miss! Generating SQL from LLM…

SELECT COUNT(*) AS ActionMovieCount
FROM movies
WHERE Genre = 'Action';

[37]: [(166,)]

[38]: cache

[38]: [('total number of movies are made by Warner Bros company in year 2008?',
"SELECT COUNT(*) \nFROM movies \nWHERE LeadStudio = 'Warner Bros' AND Year =
2008;",
[(21,)]),
('how many movies have RottenTomatoes scores greater than 85?',
'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes > 85;',
[(120,)]),
('how many movies have RottenTomatoes scores lower than 85?',
'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes < 85;',
[(782,)]),
('how many movies with action genre are in the database',
"SELECT COUNT(*) AS ActionMovieCount\nFROM movies\nWHERE Genre = 'Action';",
[(166,)])]

[ ]:

Python SQLite
No ratings yet
Python SQLite
7 pages
Ex01-Quick Start
No ratings yet
Ex01-Quick Start
2 pages
SQLite3 Python Module Overview
No ratings yet
SQLite3 Python Module Overview
31 pages
App Assignment 5
No ratings yet
App Assignment 5
7 pages
Module 3 Notes
No ratings yet
Module 3 Notes
45 pages
Docs Python Org 3 Library Sqlite3 HTML
No ratings yet
Docs Python Org 3 Library Sqlite3 HTML
35 pages
Unit-7 Working With Databases
No ratings yet
Unit-7 Working With Databases
43 pages
DHP SQLite Assignment Solved
No ratings yet
DHP SQLite Assignment Solved
10 pages
Python SQLite Database Tutorial
No ratings yet
Python SQLite Database Tutorial
49 pages
Practical Assignment3
No ratings yet
Practical Assignment3
2 pages
Comp Rec Qs
No ratings yet
Comp Rec Qs
4 pages
Unit 4
No ratings yet
Unit 4
9 pages
Pydblite - Documentation
No ratings yet
Pydblite - Documentation
35 pages
Data Visualization Using Pyplot
No ratings yet
Data Visualization Using Pyplot
14 pages
SBL Python LAB Manual by NY Expt. No. 6
No ratings yet
SBL Python LAB Manual by NY Expt. No. 6
5 pages
Lec 16 BB
No ratings yet
Lec 16 BB
24 pages
Unit-7 Part-1 Working With Databases
No ratings yet
Unit-7 Part-1 Working With Databases
43 pages
Unit IV Part1
No ratings yet
Unit IV Part1
5 pages
303database Handling Using Python
No ratings yet
303database Handling Using Python
3 pages
1 Format For Sqlite Commands: Csca20 Worksheet - Databases
No ratings yet
1 Format For Sqlite Commands: Csca20 Worksheet - Databases
11 pages
Python SQLite Tutorial - The Ultimate Guide
No ratings yet
Python SQLite Tutorial - The Ultimate Guide
12 pages
Python 3.6.4 sqlite3 Module Guide
100% (2)
Python 3.6.4 sqlite3 Module Guide
24 pages
M4 Python SQL
No ratings yet
M4 Python SQL
40 pages
SQLAlchemy
No ratings yet
SQLAlchemy
38 pages
Sqlite3 A
No ratings yet
Sqlite3 A
25 pages
SQL and SQLite3 Basics in Python
No ratings yet
SQL and SQLite3 Basics in Python
51 pages
Unit 4 - 2 Tkinter Database App
No ratings yet
Unit 4 - 2 Tkinter Database App
14 pages
Data Base Connectvity in Python
No ratings yet
Data Base Connectvity in Python
19 pages
SQLite3 Python API Guide
No ratings yet
SQLite3 Python API Guide
18 pages
Asdasd
No ratings yet
Asdasd
14 pages
Assignment 15 Utkarsh
No ratings yet
Assignment 15 Utkarsh
12 pages
DEFCON 27 Omer Gull SELECT Code Execution FROM USING SQLite
No ratings yet
DEFCON 27 Omer Gull SELECT Code Execution FROM USING SQLite
75 pages
Assignment 2
No ratings yet
Assignment 2
1 page
Theory Assignments 1 - 4
No ratings yet
Theory Assignments 1 - 4
1 page
Relationship Between Two Tables
No ratings yet
Relationship Between Two Tables
6 pages
Tutorialfor SQliteusing Python
No ratings yet
Tutorialfor SQliteusing Python
8 pages
Importing Data in Python I: Introduction To Relational Databases
No ratings yet
Importing Data in Python I: Introduction To Relational Databases
33 pages
Computer Science (Shreya and Sasikala)
No ratings yet
Computer Science (Shreya and Sasikala)
13 pages
Unit-5 14M
No ratings yet
Unit-5 14M
26 pages
Question Bank
No ratings yet
Question Bank
2 pages
Quetion Bank
No ratings yet
Quetion Bank
2 pages
Dbms Lab El Report
No ratings yet
Dbms Lab El Report
20 pages
Python Database
No ratings yet
Python Database
3 pages
A SQLite Tutorial With Python PDF
100% (2)
A SQLite Tutorial With Python PDF
14 pages
12th STD Computer Science HANDS On PRACTICE Chapter-15
No ratings yet
12th STD Computer Science HANDS On PRACTICE Chapter-15
7 pages
Journal Answersheet
No ratings yet
Journal Answersheet
18 pages
Cse414 HW1
No ratings yet
Cse414 HW1
4 pages
Rishit Sharma S22 108 Python EXP6
No ratings yet
Rishit Sharma S22 108 Python EXP6
1 page
ML PGM
No ratings yet
ML PGM
8 pages
PY Mod 4
No ratings yet
PY Mod 4
20 pages
Excel Data Management to SQLite
No ratings yet
Excel Data Management to SQLite
3 pages
25 SQL Connectivity
No ratings yet
25 SQL Connectivity
8 pages
DHP Journal
No ratings yet
DHP Journal
2 pages
Activity - 15 Using Databases
No ratings yet
Activity - 15 Using Databases
5 pages
Python Details PDF
No ratings yet
Python Details PDF
3 pages
SQL With Python Guide
No ratings yet
SQL With Python Guide
17 pages
Library System Tutorial
No ratings yet
Library System Tutorial
3 pages
Regularization For Neural Networks 1718966083
No ratings yet
Regularization For Neural Networks 1718966083
9 pages
RBAC Configuration Management v1 1723989259
No ratings yet
RBAC Configuration Management v1 1723989259
4 pages
AI Basics for Kids: Part 2
No ratings yet
AI Basics for Kids: Part 2
12 pages
Snowflake Cortex for Data Analysts
No ratings yet
Snowflake Cortex for Data Analysts
9 pages
AI Algorithms Explained To Kids 1717055132
No ratings yet
AI Algorithms Explained To Kids 1717055132
10 pages
OpenAI O1 Technical Summary and Examples
No ratings yet
OpenAI O1 Technical Summary and Examples
20 pages
Typeof Chunking
No ratings yet
Typeof Chunking
11 pages
Types of Agents
No ratings yet
Types of Agents
16 pages
Vector Databases
100% (1)
Vector Databases
35 pages
Wavelets Meet Large Language Models
No ratings yet
Wavelets Meet Large Language Models
16 pages
SF Dataloading Commands
No ratings yet
SF Dataloading Commands
4 pages
20 Types Prompting Styles
100% (1)
20 Types Prompting Styles
22 pages
How LLMs Collaborate With Multi Agent Setup
No ratings yet
How LLMs Collaborate With Multi Agent Setup
6 pages
Descriptive Stats
No ratings yet
Descriptive Stats
83 pages
Digital Radiographic Quality Control Guide
No ratings yet
Digital Radiographic Quality Control Guide
150 pages
Yuzu Debug Log Analysis
No ratings yet
Yuzu Debug Log Analysis
1,034 pages
Si5351A VFO Kit for DSB Transceivers
No ratings yet
Si5351A VFO Kit for DSB Transceivers
2 pages
LAN Chat Application 25pages
No ratings yet
LAN Chat Application 25pages
26 pages
Purposive Communication - PDF - Nonverbal Communication - Communication
No ratings yet
Purposive Communication - PDF - Nonverbal Communication - Communication
125 pages
Ubiquitous Computing Systems: Reading 1: Weiser, M. 1991. The Computer For The 21St Century
No ratings yet
Ubiquitous Computing Systems: Reading 1: Weiser, M. 1991. The Computer For The 21St Century
10 pages
Temenja Yazh Secondary School
No ratings yet
Temenja Yazh Secondary School
15 pages
SD-WAN and Routing Subscription FAQ
No ratings yet
SD-WAN and Routing Subscription FAQ
22 pages
Jismo P3
No ratings yet
Jismo P3
33 pages
Create A Path To Nowhere (All Current and Upcoming Sinners 2025) Tier List - TierMaker
No ratings yet
Create A Path To Nowhere (All Current and Upcoming Sinners 2025) Tier List - TierMaker
1 page
Microprocessors and Microcontrollers 56
No ratings yet
Microprocessors and Microcontrollers 56
1 page
Database Basics for Beginners
No ratings yet
Database Basics for Beginners
4 pages
Tutorial 3 Answers Part 1
No ratings yet
Tutorial 3 Answers Part 1
16 pages
Essential Windows Keyboard Shortcuts
No ratings yet
Essential Windows Keyboard Shortcuts
14 pages
Erapid's Torrent Uploads on TPB & KAT
No ratings yet
Erapid's Torrent Uploads on TPB & KAT
1 page
Comparison Sheet
No ratings yet
Comparison Sheet
19 pages
Resume Upload Testing Guide
No ratings yet
Resume Upload Testing Guide
5 pages
Lean
No ratings yet
Lean
37 pages
WWW Studocu Com in N 21116842 Sid 338668551700029036
No ratings yet
WWW Studocu Com in N 21116842 Sid 338668551700029036
1 page
A New Perspective On Cloud Computing
No ratings yet
A New Perspective On Cloud Computing
10 pages
Compiler Design: Type Checking Guide
No ratings yet
Compiler Design: Type Checking Guide
39 pages
Customer Support Contacts
No ratings yet
Customer Support Contacts
2 pages
Mcafee Agent 5.7.x Product Guide
No ratings yet
Mcafee Agent 5.7.x Product Guide
75 pages
GE OEC Fluorostar Compact / Series: Mobile Digital C-Arm Service Manual
100% (2)
GE OEC Fluorostar Compact / Series: Mobile Digital C-Arm Service Manual
484 pages
Yield Based Process Capability Indices For Nonnormal Continuous Data - JQT - 2019
No ratings yet
Yield Based Process Capability Indices For Nonnormal Continuous Data - JQT - 2019
11 pages
Offiwiz File
No ratings yet
Offiwiz File
20 pages
ISA CSE Study Guide - 4th Edition
88% (8)
ISA CSE Study Guide - 4th Edition
116 pages
Memory Hierarchy Presentation Detailed
No ratings yet
Memory Hierarchy Presentation Detailed
24 pages
Data Structure Q&A for C Programming
No ratings yet
Data Structure Q&A for C Programming
32 pages
Excel Budget Template: Project Start Date Scroll To Week #
No ratings yet
Excel Budget Template: Project Start Date Scroll To Week #
5 pages

Text2SQL Agent for CSV Databases

Uploaded by

Text2SQL Agent for CSV Databases

Uploaded by

14_text2sql_agent

1 Text2SQL Agent to Interact with CSV Data

1.2 Data Ingestion Pipeline

[1]: import pandas as pd

def csv_to_sqlite(csv_file, db_name, table_name):

# Create the table schema

# Execute the table creation query

# Create table schema

# Insert CSV data into the SQLite table

# Commit and close the connection

Table 'movies' created with schema: "Movie" TEXT, "LeadStudio" TEXT,

[2]: def run_sql_query(db_name, query):

# Execute the SQL query

# Fetch all results

# Close the connection

# Return results or an empty list if no results were found

[3]: query = f"SELECT count(*) FROM {table_name};"

1.3 Ask Natural Language Questions

[5]: OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Initialize the FAISS index

# Cache will hold (user_question, sql_query, response)

[16]: def get_table_schema(db_name, table_name):

# Use PRAGMA to get the table schema

Schema for movies:

[25]: def generate_llm_prompt(table_name, table_schema):

You will be provided with a database schema and a natural

for col in table_schema:

You are an expert in writing SQL queries for relational databases.

[26]: def handle_user_question(user_question):

# Step 1: Search cache for similar questions

# Step 2: No hit, go to LLM for SQL generation

# Step 3: Run the SQL query on the database

# Step 4: Store question, SQL, and response in cache

[27]: def generate_sql_query(question):

{"content": user_prompt.format(question=question),"role": "user"}],

Cache miss! Generating SQL from LLM…

You might also like