Skip to content

Commit 4b72e2e

Browse files
prathyusha12345 authored and CESARDELATORRE committed
Create sample on Large datasets (dotnet#476)
Merging PR * Added code to classify github issues into the best 3 labels. 1.Created a class to hold Prediction values 2.Added Score field in GitHubIssuePrediction class. 3.Changed the existing code in Labeler class. 4.Added new method to find the best 3 scores and return their indexes so that we can get the labels of those matched indexes from slotnames. * Changed the images * Changed Names * changed names * Removed images * Added images * removed seed parameter during prediction * WIP: pushed sample for object detection * Removing object detection solution from master branch * Added a new sample i.e Scalable WEBAPI for Real time scenario in the repo. so added link for that sample in ReadME file in separate table. * Added code to retrieve labels from scores. Added comments how the scores are mapped to labels. * updated ReadMe file according to upstream master * Create dummy.html * Added third party license for content in Customer segmentation sample file i.e datahelper.cs * Added third party notice file * Added third party notices text file * Added third party notices text file * Added third party notices for Credit Card Fraud Detection sample. * Changed the license information according to pedigree scan results. * added license info according to pedigree scan results. * Added license file in data folder as well as Readme file. * changed info in license file * Added license file for HTML5 * Added License file under name Nicolas Gallagher and Jonathan Neal * Added license file for jQuery CSS transition library where the code contains // http://blog.alexmaccaw.com/css-transitions * Added License for files having statement "// Source: http://nicolasgallagher.com/micro-clearfix-hack/" * Added License file for CVE where we have referred CVE entries * Added license for num2fraction * Added license file for JavaScript Undo manager * Added license files for different copyright code * Product Recommendation: Added citation in datafolder. 
Added citation link in ReadMe file * Added license file * No need of this file * Minor change * Changed info in license file according to pedigree scan results * Minor change fixes issue during pedigree scan * Minor change to fix issues during pedigree scan * Minor change to fix issue during pedigree scan * Revert "Minor change to fix issue during pedigree scan" This reverts commit 52ebb67. * Minor change to fix during pedigree scan * Minor change to fix issue during pedigree scan * Added citation for datasets in data folder * Changed the license file of tensor flow. * Changed the license file * Added datasets-citation file in data folder * Added dataset-citation file in the data folder of MNIST. Added link in ReadMe file as well * Added third party notice * Added third party notice * Added license file * minor changes * removed Heart disease detection as build is not running locally. Will add it while merging upstream * minor change * Added Heart Disease sample again * Added URL Classifier sample that deals with large datasets with multiple columns and number of dataset files * Added UrlClassifier proj to solution file * added folders according to existing structure * Downloading the file and transforming data to be compatible with ML.Net API. * Refactoring the code. * Minor fix * Minor changes. * Minor changes * Refactoring * Changed project name to Large Datasets. Changed ReadMe file to add more details on how to deal with large datasets. * Minor changes * Minor changes * Added solution folder for LargeDatasets sample.
1 parent 78b99b2 commit 4b72e2e

File tree

9 files changed

+720
-0
lines changed

9 files changed

+720
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 16
4+
VisualStudioVersion = 16.0.28803.452
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LargeDatasets", "LargeDatasets\LargeDatasets.csproj", "{3EF000C4-725B-4FF0-B6C9-60F3D6EB877E}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{3EF000C4-725B-4FF0-B6C9-60F3D6EB877E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{3EF000C4-725B-4FF0-B6C9-60F3D6EB877E}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{3EF000C4-725B-4FF0-B6C9-60F3D6EB877E}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{3EF000C4-725B-4FF0-B6C9-60F3D6EB877E}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {33DE4C5F-466D-4A62-8343-1FDC688699AA}
24+
EndGlobalSection
25+
EndGlobal

samples/csharp/getting-started/LargeDatasets/LargeDatasets/Common/ConsoleHelper.cs

+290
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
using Microsoft.ML.Data;
2+
3+
namespace LargeDatasets.DataStructures
4+
{
5+
/// <summary>
/// Input row for the URL classification sample: one text label followed by a
/// fixed-size feature vector.
/// </summary>
public class UrlData
{
    // Length of the feature vector; also used as the last column index of the
    // feature range below. Private and constant, so it is not a data member.
    private const int FeatureCount = 3231961;

    // Column 0 of the input file. Program.cs maps the values "+1" and "-1" to booleans.
    [LoadColumn(0)]
    public string LabelColumn;

    // Columns 1 through FeatureCount of the input file, exposed as one vector of FeatureCount floats.
    [LoadColumn(1, FeatureCount), VectorType(FeatureCount)]
    public float[] FeatureVector;
}
14+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
using Microsoft.ML.Data;
2+
3+
namespace LargeDatasets.DataStructures
4+
{
5+
/// <summary>
/// Output row produced when a trained model scores a <c>UrlData</c> instance.
/// </summary>
public class UrlPrediction
{
    // By default a field is bound to the column that shares its name;
    // ColumnName overrides that so this field reads the "PredictedLabel" column.
    [ColumnName("PredictedLabel")]
    public bool Prediction;

    // Bound by name to the "Score" column.
    public float Score;
}
14+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Console executable targeting .NET Core 2.2. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>netcoreapp2.2</TargetFramework>
  </PropertyGroup>

  <!-- Keep everything under Data\train out of the Compile, EmbeddedResource and None item lists. -->
  <ItemGroup>
    <Compile Remove="Data\train\**" />
    <EmbeddedResource Remove="Data\train\**" />
    <None Remove="Data\train\**" />
  </ItemGroup>

  <!-- NOTE(review): MicrosoftMLVersion is not defined in this file; presumably it comes from a shared props file. Confirm. -->
  <ItemGroup>
    <PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
    <PackageReference Include="SharpZipLib" Version="1.1.0" />
  </ItemGroup>

</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
using Common;
2+
using ICSharpCode.SharpZipLib.GZip;
3+
using ICSharpCode.SharpZipLib.Tar;
4+
using Microsoft.ML;
5+
using System;
6+
using System.Collections.Generic;
7+
using System.IO;
8+
using System.Linq;
9+
using System.Net;
10+
using LargeDatasets.DataStructures;
11+
using static Microsoft.ML.DataOperationsCatalog;
12+
13+
namespace LargeDatasets
14+
{
15+
class Program
16+
{
17+
// Dataset locations, expressed relative to the folder that contains the compiled assembly.
private static readonly string originalDataDirectoryRelativePath = @"../../../Data/OriginalUrlData";
private static readonly string originalDataReltivePath = @"../../../Data/OriginalUrlData/url_svmlight";
private static readonly string preparedDataReltivePath = @"../../../Data/PreparedUrlData/url_svmlight";

// The same locations resolved against the assembly folder. These initializers must stay
// below the relative paths: static field initializers run in textual order.
private static readonly string originalDataDirectoryPath = GetAbsolutePath(originalDataDirectoryRelativePath);
private static readonly string originalDataPath = GetAbsolutePath(originalDataReltivePath);
private static readonly string preparedDataPath = GetAbsolutePath(preparedDataReltivePath);
24+
/// <summary>
/// Runs the sample end to end: downloads and prepares the dataset, trains a binary
/// classifier on 80% of it, evaluates on the remaining 20%, and prints a few predictions.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // STEP 1: Fetch the original dataset (skipped when its folder is already present).
    DownloadDataset(originalDataDirectoryPath);

    // STEP 2: Write prepared copies of the data files with the total feature count as second column.
    PrepareDataset(originalDataPath, preparedDataPath);

    var mlContext = new MLContext();

    // STEP 3: Load every file in the prepared folder as sparse, header-less text.
    var fullDataView = mlContext.Data.LoadFromTextFile<UrlData>(
        path: Path.Combine(preparedDataPath, "*"),
        hasHeader: false,
        allowSparse: true);

    // STEP 4: 80% training / 20% testing split with a fixed seed.
    TrainTestData split = mlContext.Data.TrainTestSplit(fullDataView, testFraction: 0.2, seed: 1);
    IDataView trainDataView = split.TrainSet;
    IDataView testDataView = split.TestSet;

    // STEP 5: Map the textual label to a boolean in a new "LabelKey" column.
    var UrlLabelMap = new Dictionary<string, bool>
    {
        ["+1"] = true,  // Malicious url
        ["-1"] = false, // Benign
    };
    var dataProcessingPipeLine = mlContext.Transforms.Conversion.MapValue("LabelKey", UrlLabelMap, "LabelColumn");
    ConsoleHelper.PeekDataViewInConsole(mlContext, trainDataView, dataProcessingPipeLine, 2);

    // STEP 6: Append the field-aware factorization machine trainer to the pipeline.
    var trainer = mlContext.BinaryClassification.Trainers.FieldAwareFactorizationMachine(labelColumnName: "LabelKey", featureColumnName: "FeatureVector");
    var trainingPipeLine = dataProcessingPipeLine.Append(trainer);

    // STEP 7: Train.
    Console.WriteLine("====Training the model=====");
    var mlModel = trainingPipeLine.Fit(trainDataView);
    Console.WriteLine("====Completed Training the model=====");
    Console.WriteLine("");

    // STEP 8: Evaluate against the held-out test set.
    Console.WriteLine("====Evaluating the model=====");
    var predictions = mlModel.Transform(testDataView);
    var metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "LabelKey", scoreColumnName: "Score");
    ConsoleHelper.PrintBinaryClassificationMetrics(mlModel.ToString(), metrics);

    // Score a handful of individual rows. Note that they are taken from the training set.
    Console.WriteLine("====Predicting sample data=====");
    var predEngine = mlContext.Model.CreatePredictionEngine<UrlData, UrlPrediction>(mlModel);
    List<UrlData> samples = CreateSingleDataSample(mlContext, trainDataView);
    foreach (UrlData sample in samples)
    {
        UrlPrediction result = predEngine.Predict(sample);
        Console.WriteLine($"Single Prediction --> Actual value: {sample.LabelColumn} | Predicted value: {result.Prediction}");
    }

    Console.WriteLine("====End of Process..Press any key to exit====");
    Console.ReadLine();
}
80+
81+
/// <summary>
/// Downloads the URL dataset archive and extracts it into
/// <paramref name="originalDataDirectoryPath"/>. Does nothing when that directory already exists.
/// </summary>
/// <param name="originalDataDirectoryPath">Directory that receives the extracted archive contents.</param>
public static void DownloadDataset(string originalDataDirectoryPath)
{
    // An existing directory is taken to mean the data was already downloaded and extracted.
    if (Directory.Exists(originalDataDirectoryPath))
    {
        return;
    }

    // The archive is a gzip-compressed tar; it is saved into the current working
    // directory under this (historical) file name.
    const string archiveFileName = "url_svmlight.zip";

    Console.WriteLine("====Downloading and extracting data====");
    using (var client = new WebClient())
    {
        //The code below will download a dataset from a third-party, UCI (link), and may be governed by separate third-party terms.
        //By proceeding, you agree to those separate terms.
        client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz", archiveFileName);
    }

    // using-statements guarantee that all three layers are released even when
    // extraction throws; disposal runs innermost-first (tar, gzip, file).
    using (Stream inputStream = File.OpenRead(archiveFileName))
    using (Stream gzipStream = new GZipInputStream(inputStream))
    using (TarArchive tarArchive = TarArchive.CreateInputTarArchive(gzipStream))
    {
        tarArchive.ExtractContents(originalDataDirectoryPath);
    }

    Console.WriteLine("====Downloading and extracting is completed====");
}
104+
105+
/// <summary>
/// Produces prepared copies of every ".svm" file found under <paramref name="originalDataPath"/>
/// in <paramref name="preparedDataPath"/>, but only when the prepared folder holds no files yet.
/// </summary>
/// <param name="originalDataPath">Root folder searched recursively for ".svm" files.</param>
/// <param name="preparedDataPath">Folder that receives the prepared files; created when missing.</param>
private static void PrepareDataset(string originalDataPath,string preparedDataPath)
{
    // CreateDirectory is a no-op for a directory that already exists.
    Directory.CreateDirectory(preparedDataPath);

    Console.WriteLine("====Preparing Data====");
    Console.WriteLine("");

    //ML.Net API checks for number of features column before the sparse matrix format
    //So add total number of features i.e 3231961 as second column by taking all the files from originalDataPath
    //and save those files in preparedDataPath.
    bool preparedFolderIsEmpty = Directory.GetFiles(preparedDataPath).Length == 0;
    if (preparedFolderIsEmpty)
    {
        string[] candidates = Directory.GetFiles(originalDataPath, "*.*", SearchOption.AllDirectories);
        foreach (string candidate in candidates)
        {
            // Exact, case-sensitive match on the extension.
            if (!string.Equals(Path.GetExtension(candidate), ".svm", StringComparison.Ordinal))
            {
                continue;
            }

            AddFeaturesColumn(Path.GetFullPath(candidate), preparedDataPath);
        }
    }

    Console.WriteLine("====Data Preparation is done====");
    Console.WriteLine("");
    Console.WriteLine("original data path= {0}", originalDataPath);
    Console.WriteLine("");
    Console.WriteLine("prepared data path= {0}", preparedDataPath);
    Console.WriteLine("");
}
134+
135+
/// <summary>
/// Writes a prepared copy of <paramref name="sourceFilePath"/> into
/// <paramref name="preparedDataPath"/> under the same file name. Each line is split on
/// spaces, the total feature count is inserted as the second column, and the columns
/// are re-joined with tabs.
/// </summary>
/// <param name="sourceFilePath">Path of the original space-separated data file.</param>
/// <param name="preparedDataPath">Existing directory that receives the prepared file.</param>
private static void AddFeaturesColumn(string sourceFilePath,string preparedDataPath)
{
    const string newColumnData = "3231961";

    string sourceFileName = Path.GetFileName(sourceFilePath);
    string preparedFilePath = Path.Combine(preparedDataPath, sourceFileName);

    // A file that is already present is treated as prepared. Processing it again
    // would insert the feature-count column a second time and corrupt the data.
    if (File.Exists(preparedFilePath))
    {
        return;
    }

    // Stream directly from the source: File.ReadLines is lazy and WriteAllLines consumes
    // it line by line, so neither a file copy nor an in-memory copy of the whole file is needed.
    IEnumerable<string> preparedLines = File.ReadLines(sourceFilePath).Select(line =>
    {
        List<string> columns = line.Split(' ').ToList();
        columns.Insert(1, newColumnData);
        return string.Join('\t', columns);
    });

    File.WriteAllLines(preparedFilePath, preparedLines);
}
155+
156+
/// <summary>
/// Combines the folder containing this program's assembly with <paramref name="relativePath"/>.
/// </summary>
/// <param name="relativePath">Path relative to the assembly folder.</param>
/// <returns>The combined path. Segments such as ".." are not collapsed.</returns>
public static string GetAbsolutePath(string relativePath)
{
    string assemblyFolderPath = new FileInfo(typeof(Program).Assembly.Location).Directory.FullName;
    return Path.Combine(assemblyFolderPath, relativePath);
}
165+
/// <summary>
/// Materializes the first rows of <paramref name="dataView"/> as <c>UrlData</c> objects
/// for use as prediction samples. Despite the name, up to four rows are returned.
/// </summary>
/// <param name="mlContext">Context whose data catalog performs the conversion.</param>
/// <param name="dataView">Data view the rows are read from.</param>
/// <returns>A list with at most four rows.</returns>
private static List<UrlData> CreateSingleDataSample(MLContext mlContext, IDataView dataView)
{
    // Here (ModelInput object) you could provide new test data, hardcoded or from the end-user application, instead of the row from the file.
    const int sampleCount = 4;

    IEnumerable<UrlData> rows = mlContext.Data.CreateEnumerable<UrlData>(dataView, false);
    return rows.Take(sampleCount).ToList();
}
171+
}
172+
}

0 commit comments

Comments
 (0)