New examples with save/load are working.

NiklasGustafsson · NiklasGustafsson · commit 5232b4f89fcc · 2021-04-30T12:45:01.000-07:00
diff --git a/docfx/articles/memory.md b/docfx/articles/memory.md
@@ -8,6 +8,8 @@ Two approaches are available for memory management. Technique 1 is the default a
 
 Note DiffSharp (which uses TorchSharp) relies on techniques 1.
 
+> Most of the examples included will use technique #1, doing frequent explicit calls to GC.Collect() in the training code -- if not after each batch in the training loop, at least after each epoch.
+
 ## Technique 1. Implicit disposal using finalizers
 
 In this technique all tensors (CPU and GPU) are implicitly disposed via .NET finalizers.
@@ -21,19 +23,26 @@ This is not yet done when using general tensor operations.  It is possible a mor
 
 👎 The .NET GC doesn't know of the memory pressure from CPU tensors, so failure may happen if large tensors can't be allocated
 
-👎 The .NET GC doesn't know of GPU resources
+👎 The .NET GC doesn't know of GPU resources.
+
+👎 Native operations that allocate temporaries, whether on CPU or GPU, may fail -- the GC scheme implemented by TorchSharp only works when the allocation is initiated by .NET code.
 
 ## Technique 2. Explicit disposal
 
 In this technique specific tensors (CPU and GPU) are explicitly disposed
 using `using` in C# or explicit calls to `System.IDisposable.Dispose()`.
 
-👍 control
+👍 Specific lifetime management of all resources.
+
+👎 Cumbersome, requiring lots of using statements in your code.
 
-👎 you must know when to dispose
+👎 You must know when to dispose.
+
+👎 Temporaries are not covered by this approach, so to maximize the benefit, you may have to store all temporaries in variables and dispose of them explicitly.
 
 > NOTE: Disposing a tensor only releases the underlying storage if this is the last
-> live TorchTensor which has a view on that tensor.
+> live TorchTensor which has a view on that tensor -- the native runtime does reference counting of tensors.
+
 
 ## Links and resources
 
diff --git a/docfx/articles/saveload.md b/docfx/articles/saveload.md
@@ -0,0 +1,56 @@
+# Saving and Restoring Models
+
+When using PyTorch, the expected pattern for saving and later restoring models from disk or other permanent storage media is to get the model's state and pickle it using the standard Python format.
+
+```Python
+torch.save(model.state_dict(), 'model_weights.pth')
+```
+
+When restoring the model, you are expected to first create a model of the exact same structure as the original, with random weights, then restore the state:
+
+```Python
+model = [...]
+model.load_state_dict(torch.load('model_weights.pth'))
+```
+
+This presents a couple of problems for a .NET implementation. First, Python pickling is very intimately coupled with Python and its runtime object model. It is a complex format that supports object graphs that form DAGs and faithfully maintains all object state.
+
+Second, in order to share models between .NET applications, Python pickling is not necessary, and even for moving model state from Python to .NET, it is overkill. The state of a model is a simple dictionary where the keys are strings and the values are tensors.
+
+Therefore, TorchSharp, in its current form, implements its own very simple model serialization format, which allows models originating in either .NET or Python to be loaded using .NET, as long as the model was saved using the special format.
+
+The MNIST and AdversarialExampleGeneration examples in this repo rely on saving and restoring model state -- the latter example relies on a pre-trained model from MNIST.
+
+> A future version of TorchSharp may include support for reading and writing Python pickle files directly.
+
+## How to use the TorchSharp format
+
+
+In C#, saving a model looks like this:
+
+```C#
+model.save("model_weights.dat");
+```
+
+It's important to note that calling 'save' will move the model to the CPU, where it remains after the call. If you need to continue to use the model after saving it, you will have to explicitly move it back:
+
+```C#
+model.to(Device.CUDA);
+```
+
+And loading it again is done by:
+
+```C#
+model = [...];
+model.load("model_weights.dat");
+```
+
+The model should be created on the CPU before loading weights, then moved to the target device.
+
+If the model starts out in Python, there's a simple script that allows you to use code that is very similar to the PyTorch API to save models to the TorchSharp format. Rather than placing this trivial script in a Python package and publishing it, we choose to just refer you to the script file itself, [exportsd.py](../src/Python/exportsd.py), which has all the necessary code.
+
+```Python
+f = open("model_weights.dat", "wb")
+exportsd.save_state_dict(model.to("cpu").state_dict(), f)
+f.close()
+```
diff --git a/src/Examples/AdversarialExampleGeneration.cs b/src/Examples/AdversarialExampleGeneration.cs
@@ -18,7 +18,9 @@
 namespace TorchSharp.Examples
 {
     /// <summary>
-    /// Simple MNIST Convolutional model.
+    /// FGSM Attack
+    ///
+    /// Based on: https://pytorch.org/tutorials/beginner/fgsm_tutorial.html
     /// </summary>
     /// <remarks>
     /// There are at least two interesting data sets to use with this example:
@@ -34,6 +36,13 @@ namespace TorchSharp.Examples
     ///
     /// In each case, there are four .gz files to download. Place them in a folder and then point the '_dataLocation'
     /// constant below at the folder location.
+    ///
+    /// The example is based on the PyTorch tutorial, but the results from attacking the model are very different from
+    /// what the tutorial article notes, at least on the machine where it was developed. There is an order-of-magnitude lower
+    /// drop-off in accuracy in this version. That said, when running the PyTorch tutorial on the same machine, the
+    /// accuracy trajectories are the same between .NET and Python. If the base convolutional model is trained
+    /// using Python, and then used for the FGSM attack in both .NET and Python, the drop-off trajectories are extremely
+    /// close.
     /// </remarks>
     public class AdversarialExampleGeneration
     {
@@ -74,114 +83,78 @@ static void Main(string[] args)
                 Utils.Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-labels-idx1-ubyte.gz"), targetDir);
             }
 
+            MNIST.Model model = null;
+
             var normImage = TorchVision.Transforms.Normalize(new double[] { 0.1307 }, new double[] { 0.3081 }, device: device);
 
-            using (var train = new MNISTReader(targetDir, "train", _trainBatchSize, device: device, shuffle: true, transform: normImage))
             using (var test = new MNISTReader(targetDir, "t10k", _testBatchSize, device: device, transform: normImage)) {
 
-                var model = new Model("model", Device.CPU);
-
                 var modelFile = dataset + ".model.bin";
 
                 if (!File.Exists(modelFile)) {
                     // We need the model to be trained first, because we want to start with a trained model.
                     Console.WriteLine($"\n  Running MNIST on {device.Type.ToString()} in order to pre-train the model.");
-                    MNIST.TrainingLoop(dataset, device, train, test);
-                    Console.WriteLine("Moving on to the Adversarial model.\n");
-                }
 
-                model.load(modelFile);
-                model.to(device);
+                    model = new MNIST.Model("model", device);
 
-                // Establish a baseline accuracy.
+                    using (MNISTReader train = new MNISTReader(targetDir, "train", _trainBatchSize, device: device, shuffle: true, transform: normImage)) {
+                        MNIST.TrainingLoop(dataset, device, model, train, test);
+                    }
 
-                Stopwatch sw = new Stopwatch();
-                sw.Start();
+                    Console.WriteLine("Moving on to the Adversarial model.\n");
 
-                var baseline = TestBaseline(model, nll_loss(reduction: NN.Reduction.Sum), test, test.Size);
+                } else {
+                    model = new MNIST.Model("model", Device.CPU);
+                    model.load(modelFile);
+                }
 
-                Console.WriteLine($"\rBaseline model accuracy: {baseline}");
+                model.to(device);
+                model.Eval();
 
-                sw.Stop();
-                Console.WriteLine($"Elapsed time: {sw.Elapsed.TotalSeconds} s.");
+                var epsilons = new double[] { 0, 0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50 };
 
-                GC.Collect();
+                foreach (var ε in epsilons) {
+                    var attacked = Test(model, nll_loss(), ε, test, test.Size);
+                    Console.WriteLine($"Epsilon: {ε:F2}, accuracy: {attacked:P2}");
+                }
             }
         }
 
-        private class Model : CustomModule
+        private static TorchTensor Attack(TorchTensor image, double ε, TorchTensor data_grad)
         {
-            private Conv2d conv1 = Conv2d(1, 32, 3);
-            private Conv2d conv2 = Conv2d(32, 64, 3);
-            private Linear fc1 = Linear(9216, 128);
-            private Linear fc2 = Linear(128, 10);
-
-            // These don't have any parameters, so the only reason to instantiate
-            // them is performance, since they will be used over and over.
-            private MaxPool2d pool1 = MaxPool2d(kernelSize: new long[] { 2, 2 });
-
-            private ReLU relu1 = ReLU();
-            private ReLU relu2 = ReLU();
-            private ReLU relu3 = ReLU();
-
-            private FeatureAlphaDropout dropout1 = FeatureAlphaDropout();
-            private Dropout dropout2 = Dropout();
-
-            private Flatten flatten = Flatten();
-            private LogSoftmax logsm = LogSoftmax(1);
-
-
-            public Model(string name, Device device = null) : base(name)
-            {
-                RegisterComponents();
-
-                if (device != null && device.Type == DeviceType.CUDA)
-                    this.to(device);
-            }
-
-            public override TorchTensor forward(TorchTensor input)
-            {
-                var l11 = conv1.forward(input);
-                var l12 = relu2.forward(l11);
-
-                var l21 = conv2.forward(l12);
-                var l22 = pool1.forward(l21);
-                var l23 = dropout1.forward(l22);
-                var l24 = relu2.forward(l23);
-
-                var x = flatten.forward(l24);
-
-                var l31 = fc1.forward(x);
-                var l32 = relu3.forward(l31);
-                var l33 = dropout2.forward(l32);
-
-                var l41 = fc2.forward(l33);
-
-                return logsm.forward(l41);
+            using (var sign = data_grad.sign()) {
+                var perturbed = (image + ε * sign).clamp(0.0, 1.0);
+                return perturbed;
             }
         }
 
-        private static double TestBaseline(
-            Model model,
-            Loss loss,
+        private static double Test(
+            MNIST.Model model,
+            Loss criterion,
+            double ε,
             IEnumerable<(TorchTensor, TorchTensor)> dataLoader,
             long size)
         {
-            model.Eval();
-
-            double testLoss = 0;
             int correct = 0;
 
-            foreach (var (data, target) in dataLoader)
-            {
-                var prediction = model.forward(data);
-                var output = loss(prediction, target);
-                testLoss += output.ToSingle();
+            foreach (var (data, target) in dataLoader) {
+
+                data.requires_grad = true;
+
+                using (var output = model.forward(data))
+                using (var loss = criterion(output, target)) {
 
-                var pred = prediction.argmax(1);
-                correct += pred.eq(target).sum().ToInt32();
+                    model.ZeroGrad();
+                    loss.backward();
+
+                    var perturbed = Attack(data, ε, data.grad());
+
+                    using (var final = model.forward(perturbed)) {
+
+                        correct += final.argmax(1).eq(target).sum().ToInt32();
+                    }
+                }
 
-                pred.Dispose();
 
                 GC.Collect();
             }
diff --git a/src/Examples/Examples.csproj b/src/Examples/Examples.csproj
@@ -8,7 +8,7 @@
     <TestUsesLibTorch>true</TestUsesLibTorch>
     <UseMLCodeAnalyzer>false</UseMLCodeAnalyzer>
     <UseStyleCopAnalyzer>false</UseStyleCopAnalyzer>
-    <StartupObject>TorchSharp.Examples.MNIST</StartupObject>
+    <StartupObject>TorchSharp.Examples.AdversarialExampleGeneration</StartupObject>
     <IsPackable>false</IsPackable>
     <PlatformTarget>x64</PlatformTarget>
     <RootNamespace>TorchSharp.Examples</RootNamespace>
diff --git a/src/Examples/MNIST.cs b/src/Examples/MNIST.cs
@@ -50,7 +50,6 @@ static void Main(string[] args)
 
             var cwd = Environment.CurrentDirectory;
 
-            //var device = Device.CPU;
             var device = Torch.IsCudaAvailable() ? Device.CUDA : Device.CPU;
             Console.WriteLine($"Running MNIST on {device.Type.ToString()}");
             Console.WriteLine($"Dataset: {dataset}");
@@ -69,21 +68,25 @@ static void Main(string[] args)
             if (device.Type == DeviceType.CUDA) {
                 _trainBatchSize *= 4;
                 _testBatchSize *= 4;
-                _epochs *= 4;
             }
 
+            var model = new Model("model", device);
+
             var normImage = TorchVision.Transforms.Normalize(new double[] { 0.1307 }, new double[] { 0.3081 }, device: device);
 
             using (MNISTReader train = new MNISTReader(targetDir, "train", _trainBatchSize, device: device, shuffle: true, transform: normImage),
                                 test = new MNISTReader(targetDir, "t10k", _testBatchSize, device: device, transform: normImage)) {
 
-                TrainingLoop(dataset, device, train, test);
+                TrainingLoop(dataset, device, model, train, test);
             }
         }
 
-        internal static void TrainingLoop(string dataset, Device device, MNISTReader train, MNISTReader test)
+        internal static void TrainingLoop(string dataset, Device device, Model model, MNISTReader train, MNISTReader test)
         {
-            var model = new Model("model", device);
+            if (device.Type == DeviceType.CUDA) {
+                _epochs *= 4;
+            }
+
             var optimizer = NN.Optimizer.Adam(model.parameters());
 
             var scheduler = NN.Optimizer.StepLR(optimizer, 1, 0.7, last_epoch: 5);
@@ -100,13 +103,13 @@ internal static void TrainingLoop(string dataset, Device device, MNISTReader tra
             }
 
             sw.Stop();
-            Console.WriteLine($"Elapsed time: {sw.Elapsed.TotalSeconds} s.");
+            Console.WriteLine($"Elapsed time: {sw.Elapsed.TotalSeconds:F1} s.");
 
             Console.WriteLine("Saving model to '{0}'", dataset + ".model.bin");
             model.save(dataset + ".model.bin");
         }
 
-        private class Model : CustomModule
+        internal class Model : CustomModule
         {
             private Conv2d conv1 = Conv2d(1, 32, 3);
             private Conv2d conv2 = Conv2d(32, 64, 3);
@@ -121,8 +124,8 @@ private class Model : CustomModule
             private ReLU relu2 = ReLU();
             private ReLU relu3 = ReLU();
 
-            private FeatureAlphaDropout dropout1 = FeatureAlphaDropout();
-            private Dropout dropout2 = Dropout();
+            private Dropout dropout1 = Dropout(0.25);
+            private Dropout dropout2 = Dropout(0.5);
 
             private Flatten flatten = Flatten();
             private LogSoftmax logsm = LogSoftmax(1);
@@ -141,9 +144,9 @@ public override TorchTensor forward(TorchTensor input)
                 var l12 = relu2.forward(l11);
 
                 var l21 = conv2.forward(l12);
-                var l22 = pool1.forward(l21);
-                var l23 = dropout1.forward(l22);
-                var l24 = relu2.forward(l23);
+                var l22 = relu2.forward(l21);
+                var l23 = pool1.forward(l22);
+                var l24 = dropout1.forward(l23);
 
                 var x = flatten.forward(l24);
 
@@ -184,7 +187,7 @@ private static void Train(
                 optimizer.step();
 
                 if (batchId % _logInterval == 0) {
-                    Console.WriteLine($"\rTrain: epoch {epoch} [{batchId * batchSize} / {size}] Loss: {output.ToSingle()}");
+                    Console.WriteLine($"\rTrain: epoch {epoch} [{batchId * batchSize} / {size}] Loss: {output.ToSingle():F4}");
                 }
 
                 batchId++;
@@ -220,7 +223,7 @@ private static void Test(
 
             Console.WriteLine($"Size: {size}, Total: {size}");
 
-            Console.WriteLine($"\rTest set: Average loss {testLoss / size} | Accuracy {(double)correct / size}");
+            Console.WriteLine($"\rTest set: Average loss {(testLoss / size):F4} | Accuracy {((double)correct / size):P2}");
         }
     }
 }
diff --git a/src/Examples/MNISTReader.cs b/src/Examples/MNISTReader.cs
@@ -92,7 +92,7 @@ public MNISTReader(string path, string prefix, int batch_size = 32, bool shuffle
                     var idx = indices[i++];
                     var imgStart = idx * imgSize;
 
-                    var floats = dataBytes[imgStart.. (imgStart+imgSize)].Select(b => (float)b).ToArray();
+                    var floats = dataBytes[imgStart.. (imgStart+imgSize)].Select(b => b/256.0f).ToArray();
                     using (var inputTensor = Float32Tensor.from(floats))
                         dataTensor.index_put_(new TorchTensorIndex [] { TorchTensorIndex.Single(j) }, inputTensor);
                     lablTensor[j] = Int64Tensor.from(labelBytes[idx]);
diff --git a/src/Python/exportsd.py b/src/Python/exportsd.py