Skip to content

Commit 924c383

Browse files
PR295 : Database Example (dotnet#405)
* This provides an example of integrating a database within ML.Net. The sample provides a solution that integrates with the existing sample solution, along with instructions for how to create the database and run the example. * - Updating based upon feedback. * - Addressing feedback, updating the README.md. * - Another round of updates. * - Addressing the ToList feedback. * Removed solution folder. will get it from amster when merging
1 parent bd5f36c commit 924c383

File tree

5 files changed

+244
-0
lines changed

5 files changed

+244
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28010.2041
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DatabaseIntegration", "DatabaseIntegration\DatabaseIntegration.csproj", "{E0EA9351-CD82-485F-94DD-5EA500814943}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{E0EA9351-CD82-485F-94DD-5EA500814943}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{E0EA9351-CD82-485F-94DD-5EA500814943}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{E0EA9351-CD82-485F-94DD-5EA500814943}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{E0EA9351-CD82-485F-94DD-5EA500814943}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {5B3ACA76-DA8A-40E8-A097-B51EF1BB9D12}
24+
EndGlobalSection
25+
EndGlobal
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="3.0.0-preview.19074.3" />
10+
<PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="3.0.0-preview.19074.3" />
11+
<PackageReference Include="Microsoft.ML" Version="1.0.0-preview" />
12+
<PackageReference Include="Microsoft.ML.LightGBM" Version="1.0.0-preview" />
13+
<PackageReference Include="Microsoft.ML.FastTree" Version="1.0.0-preview" />
14+
</ItemGroup>
15+
<ItemGroup>
16+
<Folder Include="Common\" />
17+
</ItemGroup>
18+
<ItemGroup>
19+
<Compile Include="..\..\..\common\ConsoleHelper.cs" Link="Common\ConsoleHelper.cs" />
20+
</ItemGroup>
21+
22+
23+
</Project>
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
using Microsoft.EntityFrameworkCore;
2+
using System.Collections.Generic;
3+
using System.ComponentModel.DataAnnotations;
4+
using System.ComponentModel.DataAnnotations.Schema;
5+
6+
namespace DatabaseIntegration
7+
{
8+
/// <summary>
/// Entity Framework Core database context used by the sample. It exposes the
/// adult census rows as a single entity set and is backed by SQLite.
/// </summary>
public class AdultCensusContext : DbContext
{
    /// <summary>
    /// The entity set of adult census rows.
    /// </summary>
    public DbSet<AdultCensus> AdultCensus { get; set; }

    /// <summary>
    /// Points the context at SQLite using the data source <c>mlexample.db</c>.
    /// </summary>
    /// <param name="optionsBuilder">Builder on which the SQLite provider is configured.</param>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
        => optionsBuilder.UseSqlite("Data Source=mlexample.db");
}
17+
/// <summary>
/// One adult census record, used as the database entity type.
/// </summary>
/// <remarks>
/// The property declaration order is kept as-is on purpose.
/// NOTE(review): the declaration order presumably determines the column order seen by
/// consumers that reflect over this type - confirm before reordering.
/// </remarks>
public class AdultCensus
{
    /// <summary>Primary key; declared as a database-generated identity value.</summary>
    [Key, DatabaseGenerated(DatabaseGeneratedOption.Identity)]
    public int AdultCensusId { get; set; }

    /// <summary>Age, stored as an integer.</summary>
    public int Age { get; set; }

    /// <summary>Work class, stored as text.</summary>
    public string Workclass { get; set; }

    /// <summary>Education, stored as text.</summary>
    public string Education { get; set; }

    /// <summary>Marital status, stored as text.</summary>
    public string MaritalStatus { get; set; }

    /// <summary>Occupation, stored as text.</summary>
    public string Occupation { get; set; }

    /// <summary>Relationship, stored as text.</summary>
    public string Relationship { get; set; }

    /// <summary>Race, stored as text.</summary>
    public string Race { get; set; }

    /// <summary>Sex, stored as text.</summary>
    public string Sex { get; set; }

    /// <summary>Capital gain; note that it is stored as text, not as a number.</summary>
    public string CapitalGain { get; set; }

    /// <summary>Capital loss; note that it is stored as text, not as a number.</summary>
    public string CapitalLoss { get; set; }

    /// <summary>Hours per week, stored as an integer.</summary>
    public int HoursPerWeek { get; set; }

    /// <summary>Native country, stored as text.</summary>
    public string NativeCountry { get; set; }

    /// <summary>Boolean label of the record.</summary>
    public bool Label { get; set; }
}
36+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
using Common;
2+
using Microsoft.EntityFrameworkCore;
3+
using Microsoft.ML;
4+
using Microsoft.ML.Transforms;
5+
using Microsoft.ML.Trainers;
6+
using System;
7+
using System.IO;
8+
using System.Linq;
9+
using System.Net;
10+
using System.Collections.Generic;
11+
using System.Threading.Tasks;
12+
13+
namespace DatabaseIntegration
14+
{
15+
public class Program
16+
{
17+
// The url for the dataset that will be downloaded
18+
// The path contains a commit hash (244a8c2a...) rather than a branch name; presumably this pins the download to one fixed revision of the file.
public static string datasetUrl = "https://raw.githubusercontent.com/dotnet/machinelearning/244a8c2ac832657af282aa312d568211698790aa/test/data/adult.train";
19+
20+
/// <summary>
/// Reads the text resource at <paramref name="url"/> and yields it one line at a time.
/// </summary>
/// <remarks>
/// This is an iterator method: nothing is opened until the returned sequence is enumerated,
/// and the client, stream and reader are disposed when the enumeration finishes or is disposed.
/// </remarks>
/// <param name="url">Address of the resource to read.</param>
/// <returns>A deferred sequence containing each line of the resource, in order.</returns>
public static IEnumerable<string> ReadRemoteDataset(string url)
{
    using (var webClient = new WebClient())
    using (var responseStream = webClient.OpenRead(url))
    using (var lineReader = new StreamReader(responseStream))
    {
        // ReadLine returns null once the end of the stream has been reached.
        for (var current = lineReader.ReadLine(); current != null; current = lineReader.ReadLine())
        {
            yield return current;
        }
    }
}
33+
34+
/// <summary>
/// Runs the database query and streams its rows, creating a new database context
/// for every enumeration of the returned sequence.
/// </summary>
/// <remarks>
/// ML.Net can traverse an IEnumerable with multiple threads, which makes Entity Framework Core throw
/// because multiple threads cannot access the same database context. As a workaround, a database
/// context is created each time an IEnumerable is requested.
/// </remarks>
/// <returns>An IEnumerable of the resulting data.</returns>
private static IEnumerable<AdultCensus> QueryData()
{
    using (var context = new AdultCensusContext())
    {
        // Select everything from the AdultCensus table. The caller hands this sequence to
        // LoadFromEnumerable, which returns an IDataView that an ML.Net pipeline can consume.
        //  - The rows are sorted by AdultCensusId (an auto-generated id) because ML.Net needs the
        //    training data in the same order on every pass to produce consistent results.
        //  - Tracking is switched off (AsNoTracking) because this scenario only needs read-only access.
        var orderedRows = context.AdultCensus
            .AsNoTracking()
            .OrderBy(census => census.AdultCensusId);

        foreach (var record in orderedRows)
        {
            yield return record;
        }
    }
}
61+
62+
/// <summary>
/// Recreates the database and populates the AdultCensus table from the dataset at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// Any existing database is deleted first. The first line of the dataset is treated as a header
/// and skipped. Lines that do not contain every column read below (for example blank lines) are
/// skipped instead of failing with an <see cref="IndexOutOfRangeException"/>.
/// </remarks>
/// <param name="url">Location of the comma-separated dataset to load.</param>
/// <exception cref="FormatException">A numeric column (0, 12 or 14) does not hold an integer.</exception>
public static void CreateDatabase(string url)
{
    // The highest column index read below is 14, so a usable row needs at least 15 fields.
    const int requiredColumnCount = 15;

    // The dataset is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var dataset = ReadRemoteDataset(url);
    using (var db = new AdultCensusContext())
    {
        // Ensure that we have a clean database to start with.
        db.Database.EnsureDeleted();
        db.Database.EnsureCreated();
        Console.WriteLine("Database created, populating...");

        // Parse the dataset. Columns 2 and 4 are intentionally not stored.
        var data = dataset
            .Skip(1) // Skip the header row
            .Select(line => line.Split(','))
            .Where(row => row.Length >= requiredColumnCount)
            .Select(row => new AdultCensus
            {
                Age = int.Parse(row[0], invariant),
                Workclass = row[1],
                Education = row[3],
                MaritalStatus = row[5],
                Occupation = row[6],
                Relationship = row[7],
                Race = row[8],
                Sex = row[9],
                CapitalGain = row[10],
                CapitalLoss = row[11],
                HoursPerWeek = int.Parse(row[12], invariant),
                NativeCountry = row[13],
                Label = int.Parse(row[14], invariant) == 1
            });

        // Add the data into the database
        db.AdultCensus.AddRange(data);

        var count = db.SaveChanges();
        Console.WriteLine($"Total count of items saved to database: {count}");
    }
}
104+
105+
/// <summary>
/// Entry point of the sample: seeds the database, loads its rows into ML.Net, trains a LightGBM
/// binary classifier on the training split, and prints the metrics obtained on the test split.
/// </summary>
public static void Main()
{
    // Seed the database with the dataset.
    CreateDatabase(datasetUrl);
    var mlContext = new MLContext(seed: 1);

    // Query the data from the database; QueryData creates its own database context per enumeration.
    var dataView = mlContext.Data.LoadFromEnumerable(QueryData());

    // Create the training and testing data sets.
    var trainTestData = mlContext.Data.TrainTestSplit(dataView);

    // Categorical columns to encode: (output column, input column).
    var encodedColumns = new[]
    {
        new InputOutputColumnPair("MsOHE", "MaritalStatus"),
        new InputOutputColumnPair("OccOHE", "Occupation"),
        new InputOutputColumnPair("RelOHE", "Relationship"),
        new InputOutputColumnPair("SOHE", "Sex"),
        new InputOutputColumnPair("NatOHE", "NativeCountry")
    };

    // The stages are created in the same order in which they are chained.
    // NOTE(review): creation order may matter for seeded reproducibility - confirm before reordering.
    var encoder = mlContext.Transforms.Categorical.OneHotEncoding(encodedColumns, OneHotEncodingEstimator.OutputKind.Binary);
    var featureBuilder = mlContext.Transforms.Concatenate("Features", "MsOHE", "OccOHE", "RelOHE", "SOHE", "NatOHE");
    var trainer = mlContext.BinaryClassification.Trainers.LightGbm();
    var pipeline = encoder.Append(featureBuilder).Append(trainer);

    Console.WriteLine("Training model...");
    var model = pipeline.Fit(trainTestData.TrainSet);

    Console.WriteLine("Predicting...");

    // With the model trained, score the held-out test data set to check its prediction results.
    var predictions = model.Transform(trainTestData.TestSet);

    // Calculate the metrics of those predictions and output the results.
    var metrics = mlContext.BinaryClassification.Evaluate(predictions);
    ConsoleHelper.PrintBinaryClassificationMetrics("Database Example", metrics);
    ConsoleHelper.ConsolePressAnyKey();
}
139+
}
140+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Using a database as a data source
2+
This sample demonstrates how to use a database as a data source for an ML.Net pipeline. As ML.Net does not have native support for a database, this sample shows how data can be accessed using an IEnumerable. Since the database is treated as any other datasource, it is possible to query the database and use the resulting data for training and prediction scenarios.
3+
4+
## Problem
5+
Enterprise users have a need to use their existing data set that is in their company's database to train and predict with ML.Net. They need support to leverage their existing relational table schema, ability to read from the database directly, and to be aware of memory limitations as the data is being consumed.
6+
7+
## Solution
8+
This sample shows how to use Entity Framework Core to connect to a database, query it, and feed the resulting data into an ML.Net pipeline.
9+
10+
This sample uses SQLite to help demonstrate the database integration, but any database that is supported by Entity Framework Core can be used. As ML.Net can create an IDataView from an IEnumerable, this sample will use the IEnumerable that is returned from a query to feed the data into the ML.Net pipeline. To prevent Entity Framework Core from loading all the data in from a result, a no-tracking query is used.
11+
12+
The sample will do the following:
13+
- Downloads a sample dataset
14+
- Creates and populates the database
15+
- Queries the database for the dataset
16+
- Converts the IEnumerable to IDataView
17+
- Trains a LightGBM Binary Classification model
18+
- Queries the database for a test dataset
19+
- Runs predictions
20+
- Evaluates the prediction metrics

0 commit comments

Comments
 (0)