|
| 1 | +using Common; |
| 2 | +using ICSharpCode.SharpZipLib.GZip; |
| 3 | +using ICSharpCode.SharpZipLib.Tar; |
| 4 | +using Microsoft.ML; |
| 5 | +using System; |
| 6 | +using System.Collections.Generic; |
| 7 | +using System.IO; |
| 8 | +using System.Linq; |
| 9 | +using System.Net; |
| 10 | +using LargeDatasets.DataStructures; |
| 11 | +using static Microsoft.ML.DataOperationsCatalog; |
| 12 | + |
| 13 | +namespace LargeDatasets |
| 14 | +{ |
| 15 | + class Program |
| 16 | + { |
// Dataset locations, expressed relative to the folder that contains the compiled assembly.
private const string originalDataDirectoryRelativePath = @"../../../Data/OriginalUrlData";
private const string originalDataRelativePath = @"../../../Data/OriginalUrlData/url_svmlight";
private const string preparedDataRelativePath = @"../../../Data/PreparedUrlData/url_svmlight";

// The same locations with the assembly folder prepended (see GetAbsolutePath).
// They are assigned once during type initialization and never changed afterwards.
private static readonly string originalDataDirectoryPath = GetAbsolutePath(originalDataDirectoryRelativePath);
private static readonly string originalDataPath = GetAbsolutePath(originalDataRelativePath);
private static readonly string preparedDataPath = GetAbsolutePath(preparedDataRelativePath);
/// <summary>
/// Entry point. Downloads and prepares the URL dataset, trains a field-aware factorization
/// machine on 80% of the rows, evaluates it on the remaining 20%, and prints the predictions
/// for a handful of sample rows.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // STEP 1: Download the dataset.
    DownloadDataset(originalDataDirectoryPath);

    // STEP 2: Prepare the data by adding a second column holding the total number of features.
    PrepareDataset(originalDataPath, preparedDataPath);

    MLContext mlContext = new MLContext();

    // STEP 3: Common data loading configuration. The "*" wildcard picks up every prepared file.
    var fullDataView = mlContext.Data.LoadFromTextFile<UrlData>(
        path: Path.Combine(preparedDataPath, "*"),
        hasHeader: false,
        allowSparse: true);

    // STEP 4: Divide the whole dataset into 80% training and 20% testing data.
    TrainTestData trainTestData = mlContext.Data.TrainTestSplit(fullDataView, testFraction: 0.2, seed: 1);
    IDataView trainDataView = trainTestData.TrainSet;
    IDataView testDataView = trainTestData.TestSet;

    // STEP 5: Map the label value from string to bool.
    var urlLabelMap = new Dictionary<string, bool>
    {
        ["+1"] = true,  // Malicious URL
        ["-1"] = false, // Benign URL
    };
    var dataProcessingPipeLine = mlContext.Transforms.Conversion.MapValue("LabelKey", urlLabelMap, "LabelColumn");
    ConsoleHelper.PeekDataViewInConsole(mlContext, trainDataView, dataProcessingPipeLine, 2);

    // STEP 6: Append the trainer to the pipeline.
    var trainingPipeLine = dataProcessingPipeLine.Append(
        mlContext.BinaryClassification.Trainers.FieldAwareFactorizationMachine(
            labelColumnName: "LabelKey",
            featureColumnName: "FeatureVector"));

    // STEP 7: Train the model.
    Console.WriteLine("====Training the model=====");
    var mlModel = trainingPipeLine.Fit(trainDataView);
    Console.WriteLine("====Completed Training the model=====");
    Console.WriteLine("");

    // STEP 8: Evaluate the model against the held-out test set.
    Console.WriteLine("====Evaluating the model=====");
    var predictions = mlModel.Transform(testDataView);
    var metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "LabelKey", scoreColumnName: "Score");
    ConsoleHelper.PrintBinaryClassificationMetrics(mlModel.ToString(), metrics);

    // Try a few single predictions. The prediction engine is disposable, so it is
    // scoped with a using statement to release it deterministically.
    Console.WriteLine("====Predicting sample data=====");
    using (var predEngine = mlContext.Model.CreatePredictionEngine<UrlData, UrlPrediction>(mlModel))
    {
        // The sample rows are taken from the training data view.
        var sampleDatas = CreateSingleDataSample(mlContext, trainDataView);
        foreach (var sampleData in sampleDatas)
        {
            UrlPrediction predictionResult = predEngine.Predict(sampleData);
            Console.WriteLine($"Single Prediction --> Actual value: {sampleData.LabelColumn} | Predicted value: {predictionResult.Prediction}");
        }
    }

    Console.WriteLine("====End of Process..Press any key to exit====");
    Console.ReadLine();
}
| 80 | + |
/// <summary>
/// Downloads the URL dataset archive and extracts it into
/// <paramref name="originalDataDirectoryPath"/>. Nothing is downloaded when that directory
/// already exists.
/// </summary>
/// <param name="originalDataDirectoryPath">Directory the archive contents are extracted into.</param>
public static void DownloadDataset(string originalDataDirectoryPath)
{
    if (Directory.Exists(originalDataDirectoryPath))
    {
        return;
    }

    // The archive is a gzip-compressed tar file; it is saved under this name in the
    // current working directory.
    const string archiveFileName = "url_svmlight.zip";

    Console.WriteLine("====Downloading and extracting data====");
    using (var client = new WebClient())
    {
        //The code below will download a dataset from a third-party, UCI (link), and may be governed by separate third-party terms.
        //By proceeding, you agree to those separate terms.
        client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz", archiveFileName);
    }

    // The using statements release the file handle, the gzip stream and the tar archive
    // even when extraction throws part-way through.
    using (Stream inputStream = File.OpenRead(archiveFileName))
    using (Stream gzipStream = new GZipInputStream(inputStream))
    using (TarArchive tarArchive = TarArchive.CreateInputTarArchive(gzipStream))
    {
        tarArchive.ExtractContents(originalDataDirectoryPath);
    }

    Console.WriteLine("====Downloading and extracting is completed====");
}
| 104 | + |
/// <summary>
/// Ensures <paramref name="preparedDataPath"/> exists and, when it contains no files yet,
/// runs <c>AddFeaturesColumn</c> for every <c>.svm</c> file found under
/// <paramref name="originalDataPath"/> (searched recursively). Progress and both paths are
/// written to the console.
/// </summary>
/// <param name="originalDataPath">Directory holding the extracted original data files.</param>
/// <param name="preparedDataPath">Directory that receives the prepared copies.</param>
private static void PrepareDataset(string originalDataPath, string preparedDataPath)
{
    // CreateDirectory does nothing when the directory is already there.
    Directory.CreateDirectory(preparedDataPath);

    Console.WriteLine("====Preparing Data====");
    Console.WriteLine();

    // The ML.NET loader expects the number of features ahead of the sparse data, so each
    // original file gets the total feature count (3231961) inserted as its second column
    // and is saved into preparedDataPath. This is skipped once any prepared file exists.
    bool hasPreparedFiles = Directory.GetFiles(preparedDataPath).Length != 0;
    if (!hasPreparedFiles)
    {
        string[] candidates = Directory.GetFiles(originalDataPath, "*.*", SearchOption.AllDirectories);
        foreach (string candidate in candidates)
        {
            if (Path.GetExtension(candidate) != ".svm")
            {
                continue;
            }

            AddFeaturesColumn(Path.GetFullPath(candidate), preparedDataPath);
        }
    }

    Console.WriteLine("====Data Preparation is done====");
    Console.WriteLine();
    Console.WriteLine("original data path= {0}", originalDataPath);
    Console.WriteLine();
    Console.WriteLine("prepared data path= {0}", preparedDataPath);
    Console.WriteLine();
}
| 134 | + |
/// <summary>
/// Writes a prepared copy of <paramref name="sourceFilePath"/> into
/// <paramref name="preparedDataPath"/>: every line is split on spaces, the total feature
/// count is inserted as the second field, and the fields are re-joined with tabs.
/// </summary>
/// <remarks>
/// A file with the same name that already exists in <paramref name="preparedDataPath"/> is
/// treated as already prepared and left untouched, so calling this method repeatedly never
/// inserts the column more than once.
/// </remarks>
/// <param name="sourceFilePath">Path of the original, space-separated data file.</param>
/// <param name="preparedDataPath">Existing directory that receives the prepared file.</param>
private static void AddFeaturesColumn(string sourceFilePath, string preparedDataPath)
{
    // Total number of features in the dataset.
    const string featureCountColumn = "3231961";

    string preparedFilePath = Path.Combine(preparedDataPath, Path.GetFileName(sourceFilePath));
    if (File.Exists(preparedFilePath))
    {
        return;
    }

    // ReadLines and WriteAllLines stream line by line, so the file is never held in
    // memory as a whole.
    IEnumerable<string> preparedLines = File.ReadLines(sourceFilePath).Select(line =>
    {
        List<string> fields = line.Split(' ').ToList();
        fields.Insert(1, featureCountColumn);
        return string.Join("\t", fields);
    });

    File.WriteAllLines(preparedFilePath, preparedLines);
}
| 155 | + |
/// <summary>
/// Combines the folder that contains this program's assembly with
/// <paramref name="relativePath"/>.
/// </summary>
/// <param name="relativePath">Path relative to the assembly folder.</param>
/// <returns>
/// The combined path. It is not normalized, so any <c>..</c> segments in
/// <paramref name="relativePath"/> are kept as-is.
/// </returns>
public static string GetAbsolutePath(string relativePath)
{
    string assemblyLocation = typeof(Program).Assembly.Location;
    DirectoryInfo assemblyFolder = new FileInfo(assemblyLocation).Directory;

    return Path.Combine(assemblyFolder.FullName, relativePath);
}
/// <summary>
/// Takes the first four rows of <paramref name="dataView"/> as <c>UrlData</c> objects to
/// use as prediction samples.
/// </summary>
/// <param name="mlContext">Context used to enumerate the data view.</param>
/// <param name="dataView">Data view the sample rows are read from.</param>
/// <returns>A list holding at most four rows.</returns>
private static List<UrlData> CreateSingleDataSample(MLContext mlContext, IDataView dataView)
{
    // New input could be supplied here instead (hard-coded or coming from the end-user
    // application) rather than rows read back from the data view.
    IEnumerable<UrlData> rows = mlContext.Data.CreateEnumerable<UrlData>(dataView, reuseRowObject: false);
    return rows.Take(4).ToList();
}
| 171 | + } |
| 172 | +} |
0 commit comments