|
/// <summary>
/// Builds the source lines for the properties of a generated class, one property per
/// column found in <paramref name="columnInferenceResults"/>.
/// </summary>
/// <param name="columnInferenceResults">
/// Inference results whose <c>TextLoaderOptions.Columns</c> are enumerated in order.
/// </param>
/// <param name="columnMapping">
/// Optional overrides keyed by column name. When an entry exists for a column, its
/// <c>ColumnType</c> selects the emitted C# type and its <c>ColumnName</c> is written into
/// the <c>ColumnName</c> attribute. The property identifier itself is always derived from
/// the original column name via <c>Utils.Normalize</c>.
/// </param>
/// <returns>
/// A flat list containing, for every column and in this order: the attribute line, the
/// property declaration, and a <c>"\r\n"</c> separator entry.
/// </returns>
/// <exception cref="ArgumentException">
/// The effective data kind of a column has no matching case in the type switch.
/// </exception>
internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnInferenceResults, IDictionary<string, CodeGeneratorSettings.ColumnMapping> columnMapping = default)
{
    IList<string> result = new List<string>();

    foreach (var column in columnInferenceResults.TextLoaderOptions.Columns)
    {
        StringBuilder sb = new StringBuilder();

        // A source range wider than a single index is emitted as an array property.
        int range = (column.Source[0].Max - column.Source[0].Min).Value;
        bool isVector = range > 0;

        sb.Append(Symbols.PublicSymbol);
        sb.Append(Symbols.Space);

        // If the column is in columnMapping, use the type and name from that entry;
        // otherwise fall back to the column's own values. TryGetValue avoids the
        // repeated lookups of ContainsKey followed by the indexer.
        DataKind dataKind = column.DataKind;
        string columnName = column.Name;
        if (columnMapping != null && columnMapping.TryGetValue(column.Name, out var mapping))
        {
            dataKind = mapping.ColumnType;
            columnName = mapping.ColumnName;
        }

        switch (dataKind)
        {
            case Microsoft.ML.Data.DataKind.String:
                sb.Append(Symbols.StringSymbol);
                break;
            case Microsoft.ML.Data.DataKind.Boolean:
                sb.Append(Symbols.BoolSymbol);
                break;
            case Microsoft.ML.Data.DataKind.Single:
                sb.Append(Symbols.FloatSymbol);
                break;
            case Microsoft.ML.Data.DataKind.Double:
                sb.Append(Symbols.DoubleSymbol);
                break;
            case Microsoft.ML.Data.DataKind.Int32:
                sb.Append(Symbols.IntSymbol);
                break;
            case Microsoft.ML.Data.DataKind.UInt32:
                sb.Append(Symbols.UIntSymbol);
                break;
            case Microsoft.ML.Data.DataKind.Int64:
                sb.Append(Symbols.LongSymbol);
                break;
            case Microsoft.ML.Data.DataKind.UInt64:
                sb.Append(Symbols.UlongSymbol);
                break;
            default:
                // Report the kind that was actually switched on, which may come from
                // columnMapping rather than from the inferred column.
                throw new ArgumentException($"The data type '{dataKind}' is not handled currently.");
        }

        if (isVector)
        {
            // Attributes inside one attribute section must be comma-separated, so a comma
            // is required between LoadColumn(...) and VectorType(...).
            result.Add($"[ColumnName(\"{columnName}\"),LoadColumn({column.Source[0].Min}, {column.Source[0].Max}), VectorType({(range + 1)})]");
            sb.Append("[]");
        }
        else
        {
            result.Add($"[ColumnName(\"{columnName}\"), LoadColumn({column.Source[0].Min})]");
        }

        sb.Append(" ");
        sb.Append(Utils.Normalize(column.Name));
        sb.Append("{get; set;}");

        result.Add(sb.ToString());
        result.Add("\r\n");
    }

    return result;
}
As @LittleLittleCloud noted in this comment in PR #5177 for fixing Issue #3902, columns generated from inline data are currently named in the following way:
machinelearning/src/Microsoft.ML.CodeGenerator/Utils.cs
Lines 49 to 68 in 33f5f32
This method of directly using `Utils.Normalize` is different from using `GenerateClassLabels` instead to obtain normalized and sanitized column names. `GenerateClassLabels` can accommodate conflicting/duplicate column names, whereas in `GenerateSampleData()` this situation results in exceptions.
machinelearning/src/Microsoft.ML.CodeGenerator/Utils.cs
Lines 246 to 318 in 33f5f32
To-do:
`GenerateSampleData()` can accommodate conflicting/duplicate column names by using `Utils.GenerateClassLabels()`.