import random
def load_csv(file_path):
    """Read a comma-separated text file into a header and a list of rows.

    The first non-blank line is used as the header and every following
    non-blank line becomes one row. Lines are stripped of surrounding
    whitespace and split on plain commas, so quoted fields that themselves
    contain commas are not supported.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(header, data)`` tuple where ``header`` is the list of column
        names and ``data`` is a list of rows, each a list of strings.

    Raises:
        ValueError: If the file contains no non-blank line.
    """
    with open(file_path, 'r') as file:
        stripped = (line.strip() for line in file)
        # Blank lines (typically a trailing newline at end of file) would
        # otherwise turn into bogus one-field rows.
        lines = [line for line in stripped if line]
    if not lines:
        raise ValueError(f"CSV file is empty: {file_path}")
    header = lines[0].split(',')
    data = [line.split(',') for line in lines[1:]]
    return header, data
def encode_data(data, header):
    """Replace every cell by an integer code that is unique within its column.

    Codes are assigned per column in order of first appearance, starting at
    0, so the encoding is reproducible from one run to the next (iterating a
    ``set`` of strings is not).

    Args:
        data: List of rows; each row must have one value per header entry.
            Values must be hashable.
        header: List of column names, in column order.

    Returns:
        A ``(encoded_data, unique_values)`` tuple. ``encoded_data`` has the
        same shape as ``data`` with integer codes in place of the values.
        ``unique_values`` maps each column name to the list of its distinct
        values, where a value's list position is its code.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    columns = [
        list(dict.fromkeys(row[i] for row in data))
        for i, _ in enumerate(header)
    ]
    # One value -> code table per column position, so encoding a cell is a
    # constant-time lookup instead of a linear list.index() scan.
    lookups = [
        {value: code for code, value in enumerate(values)}
        for values in columns
    ]
    encoded_data = [
        [lookups[i][value] for i, value in enumerate(row)]
        for row in data
    ]
    unique_values = dict(zip(header, columns))
    return encoded_data, unique_values
def train_test_split(data, test_size=0.2, seed=None):
    """Randomly split rows into a training part and a test part.

    The input list is left untouched: a shallow copy is shuffled, so callers
    can keep using ``data`` in its original order afterwards.

    Args:
        data: Sequence of rows to split.
        test_size: Fraction of the rows that goes to the test part
            (default 0.2, i.e. an 80% / 20% split).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(train_data, test_data)`` tuple of lists.
    """
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    split_index = int(len(shuffled) * (1 - test_size))
    return shuffled[:split_index], shuffled[split_index:]
def calculate_centroids(data, labels, num_classes):
    """Compute the mean feature vector (centroid) of every class.

    Args:
        data: List of feature rows; every row is expected to have as many
            values as the first one.
        labels: Class index of each row, used directly as an index into the
            result, so each must be smaller than ``num_classes``.
        num_classes: Number of centroids to produce.

    Returns:
        A list of ``num_classes`` centroids. A class with no rows gets an
        all-zero centroid. When ``data`` is empty each centroid is an empty
        list, since the feature count is unknown.
    """
    if not data:
        return [[] for _ in range(num_classes)]

    num_features = len(data[0])
    sums = [[0] * num_features for _ in range(num_classes)]
    counts = [0] * num_classes
    for row, label in zip(data, labels):
        # Count each row once, not once per feature.
        counts[label] += 1
        class_sums = sums[label]
        for j, value in enumerate(row):
            class_sums[j] += value

    return [
        [total / count for total in class_sums] if count else [0] * num_features
        for class_sums, count in zip(sums, counts)
    ]
def predict_centroid(x, centroids):
    """Return the index of the centroid closest to ``x``.

    Closeness is measured by squared Euclidean distance over the features of
    ``x``. Ties go to the centroid with the lowest index.

    Args:
        x: Feature vector to classify.
        centroids: List of centroids, each with at least ``len(x)`` values.

    Returns:
        The index of the nearest centroid, or -1 when ``centroids`` is empty.
    """
    min_dist = float('inf')
    best_class = -1
    for i, centroid in enumerate(centroids):
        dist = sum((value - centroid[j]) ** 2 for j, value in enumerate(x))
        # Strict comparison keeps the first of several equally near centroids.
        if dist < min_dist:
            min_dist = dist
            best_class = i
    return best_class
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that match the true labels.

    Args:
        y_true: Sequence of expected labels.
        y_pred: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The share of positions where both sequences agree, between 0 and 1.
        Empty input yields 0.0 instead of dividing by zero.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred differ in length: {len(y_true)} != {len(y_pred)}"
        )
    if not y_true:
        return 0.0
    correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return correct / len(y_true)
# Load the raw data
header, data = load_csv('/mnt/data/Student_performance_data.csv')
# Encode every column as integer category codes
encoded_data, unique_values = encode_data(data, header)
# Separate features (X) from the label (y); the label is the last column
X = [row[:-1] for row in encoded_data]
y = [row[-1] for row in encoded_data]
# Split into training and test sets. A copy is passed so that encoded_data
# keeps the file's row order for the export at the end of the script.
train_data, test_data = train_test_split(list(encoded_data))
X_train = [row[:-1] for row in train_data]
y_train = [row[-1] for row in train_data]
X_test = [row[:-1] for row in test_data]
y_test = [row[-1] for row in test_data]
# Compute one centroid per class. The class count comes from the label
# column's full category list: counting only the labels present in y_train
# would undersize the centroid table whenever a class is missing from the
# training split, and a higher label code would then be out of range.
num_classes = len(unique_values[header[-1]])
centroids = calculate_centroids(X_train, y_train, num_classes)
# Predict a class for every test row
y_pred = [predict_centroid(x, centroids) for x in X_test]
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Précision du modèle : {accuracy:.2f}")
# Save the encoded data
with open('/mnt/data/PremierNom_DernierNom_Student_performance.csv', 'w') as file:
    file.write(','.join(header) + '\n')
    for row in encoded_data:
        file.write(','.join(map(str, row)) + '\n')