
Commit c45ddf6

Mark Linderman authored and committed
ex4 part 1 solution
1 parent c351093 commit c45ddf6

File tree

2 files changed: +30 −10 lines changed

machine-learning-ex3/ex3/predict.m

+7-2
@@ -36,8 +36,9 @@
 
 % had to ponder this for quite some time and then I
 % had to write it down to visualize what I wanted:
-% basically, you're creating a new X with
-% 25 rows and 401 columns (from 5000 rows and 401 columns)
+% basically, you're creating a new X (a2) with
+% 5000 rows and 25 columns (from 5000 rows and 401 columns)
+% and then a new X (a3) with 5000 rows and 10 cols (the outputs)
 
 % Theta1:
 % 43, 54, 65, 67, 86 ........... 401
@@ -55,6 +56,10 @@
 % .
 % 5000
 
+% so, to line up Theta1 and X for multiplication,
+% transpose X so that the 1st Theta1 row will match up with
+% the X columns
+
 % seems the desired result will reduce
 % the inputs from 401 features down to 25 (the number of inputs).
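The dimension bookkeeping in these comments is easy to check outside Octave. A minimal NumPy sketch of the same forward pass (not the course code; the shapes come from the comments above, and the random weights are a stand-in for the trained Theta matrices):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

m = 5000                               # number of training examples
X = np.random.rand(m, 400)             # 400 raw input features
Theta1 = np.random.rand(25, 401)       # hidden-layer weights (25 units)
Theta2 = np.random.rand(10, 26)        # output-layer weights (10 labels)

a1 = np.hstack([np.ones((m, 1)), X])   # add bias column -> 5000 x 401
a2 = sigmoid(a1 @ Theta1.T)            # 5000 x 25, as the comment says
a2 = np.hstack([np.ones((m, 1)), a2])  # add bias column -> 5000 x 26
a3 = sigmoid(a2 @ Theta2.T)            # 5000 x 10 (the outputs)

print(a1.shape, a2.shape, a3.shape)
```

Writing `a1 @ Theta1.T` is the NumPy equivalent of `a1*transpose(Theta1)` in the Octave code: it lines each Theta1 row (401 weights) up against each example's 401 features.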

machine-learning-ex4/ex4/nnCostFunction.m

+23-8
@@ -23,6 +23,7 @@
 num_labels, (hidden_layer_size + 1));
 size(Theta1)
 size(Theta2)
+size(X)
 
 % Setup some useful variables
 m = size(X, 1);
@@ -64,25 +65,39 @@
 % and Theta2_grad from Part 2.
 %
 
-% add bias column of 1's to Theta1 and Theta2
-a1 = [ones(m,1), X];
-a2 = [ones(m,1), sigmoid(a1*transpose(Theta1))];
-a3 = sigmoid(a2*transpose(Theta2));
+% add bias column of 1's to X and call it a1
+% a1 = 5000 x 401
+% Theta1 = 25 x 401
+% Theta2 = 10 x 26
+
+% Note: multiplication of the Theta matrices with a(n) matrices
+% can be done two ways, the one implemented below requiring
+% fewer transpositions
 
-% compute the cost by looping over a3 values and accumulating the costs for each output node (10 in this case), for each y
+%a1 = [ones(m,1), X];
+%a2 = [ones(m,1), transpose(sigmoid(Theta1*transpose(a1)))];
+%a3 = transpose(sigmoid(Theta2*transpose(a2)));
+
+a1 = [ones(m,1), X];
+a2 = [ones(m,1), sigmoid(a1 * transpose(Theta1))];
+a3 = sigmoid(a2 * transpose(Theta2));
+size(a2)
+size(a3)
+size(y)
+
+% compute the cost by looping over a3 values and accumulating the
+% costs for each output node (10 in this case), for each y
 for i = 1:m
 % map y values to a row vector of num_labels length so that you can compare to output nodes while computing cost
 yvector = zeros(1,num_labels);
 yvector(1, y(i)) = 1;
 for j = 1:num_labels
-% log(a3(j))
 % yvector and a3 should both be a row vector of size num_labels
-J += -yvector(j)*log(a3(j)) - (1 - yvector(j))*log(1 - a3(j));
+J += -yvector(j)*log(a3(i,j)) - (1 - yvector(j))*log(1 - a3(i,j));
 end
 end
 
 J = (1/m) * J;
-
 