|
% Start from a clean console and an empty workspace before the definitions.
clc; clear;
| 3 | + |
% Compute the best k-means cost for every cluster count from 1 up to the
% number of rows of X.
%
% Parameters:
%   X - matrix whose rows are handed to multipleKMeans as the data points.
%
% Returns:
%   costs - 1-by-size(X, 1) row vector; costs(k) is the minimum cost returned
%           by multipleKMeans(X, k, 5). It is 1-by-0 when X has no rows.
function costs = clusterVariationCosts(X)
  K = size(X, 1);
  restartsPerK = 5;  % number of runs multipleKMeans performs for each k

  % Preallocate: the result is defined even when the loop never executes
  % (X without rows), and the vector is not regrown on every iteration.
  costs = zeros(1, K);
  for i = 1:K
    [~, minCost] = multipleKMeans(X, i, restartsPerK);
    costs(i) = minCost;
  endfor
endfunction
| 11 | + |
% Run kMeansClustering `iters` times and keep the run with the lowest cost.
%
% Parameters:
%   X     - data matrix, one point per row.
%   K     - number of clusters requested from kMeansClustering.
%   iters - how many independent runs to perform.
%
% Returns:
%   kMeans  - K-by-size(X, 2) matrix of centroids from the cheapest run.
%   minCost - cost reported by that run (the first one on ties, as picked by min).
function [kMeans, minCost] = multipleKMeans(X, K, iters)
  featureCount = columns(X);

  % One slot per run: its cost and its K-by-featureCount centroid matrix.
  runCosts = zeros(iters, 1);
  runCentroids = zeros(iters, K, featureCount);

  for run = 1:iters
    [centroidsOfRun, costOfRun] = kMeansClustering(X, K);
    runCosts(run) = costOfRun;
    runCentroids(run, :, :) = centroidsOfRun;
  endfor

  [minCost, bestRun] = min(runCosts);
  % Drop the leading singleton "run" dimension of the selected slice.
  kMeans = reshape(runCentroids(bestRun, :, :), K, featureCount);
endfunction
| 24 | + |
% Cluster the rows of X into K groups with a fixed number of k-means passes.
%
% The initial centroids are K distinct rows of X picked at random, so the
% result varies from call to call.
%
% Parameters:
%   X - data matrix, one point per row.
%   K - number of clusters; must not exceed the number of rows of X.
%
% Returns:
%   clusters - K-by-size(X, 2) matrix of centroids after the last pass.
%   minCost  - mean, over the rows of X, of the squared Euclidean distance
%              to the nearest returned centroid.
function [clusters, minCost] = kMeansClustering(X, K)
  m = size(X, 1);
  if K > m
    error('kMeansClustering: K (%d) exceeds the number of data points (%d)', K, m);
  endif

  % Randomly pick K distinct data points as the starting centroids.
  randomizedDataSet = randperm(m);
  clusters = X(randomizedDataSet(1:K), :);

  % Alternate between assigning points to the nearest centroid and moving
  % each centroid to the mean of its assigned points.
  for iter = 1:10
    pointGroup = distanceFromClusterPoints(X, K, clusters);
    [~, index] = min(pointGroup, [], 2);
    clusters = centroids(X, K, index, clusters);
  endfor

  % Evaluate the cost against the centroids that are actually returned;
  % computing it inside the loop before the update would report the cost of
  % the previous centroids instead.
  finalDistances = distanceFromClusterPoints(X, K, clusters);
  minCost = (1 / m) * sum(min(finalDistances, [], 2));
endfunction
| 42 | + |
% Build the table of squared Euclidean distances between points and centroids.
%
% Parameters:
%   X        - data matrix, one point per row.
%   K        - number of centroids to use (rows 1..K of clusters).
%   clusters - centroid matrix, one centroid per row.
%
% Returns:
%   mat - size(X, 1)-by-K matrix; mat(r, k) is the squared distance from
%         row r of X to row k of clusters.
function mat = distanceFromClusterPoints(X, K, clusters)
  mat = zeros(rows(X), K);
  for k = 1:K
    % Broadcasting subtracts the single centroid row from every row of X.
    offsets = X - clusters(k, :);
    mat(:, k) = sum(offsets .^ 2, 2);
  endfor
endfunction
| 50 | + |
% Recompute each centroid as the mean of the points assigned to it.
%
% Parameters:
%   X           - data matrix, one point per row.
%   K           - number of clusters.
%   clusterData - for each row of X, the index (1..K) of its cluster.
%   clusters    - current K-by-size(X, 2) centroid matrix; a cluster with no
%                 assigned points keeps its row from here.
%
% Returns:
%   c - K-by-size(X, 2) matrix of updated centroids.
function c = centroids(X, K, clusterData, clusters)
  n = size(X, 2);
  c = zeros(K, n);
  for k = 1:K
    members = (clusterData == k);
    if any(members)
      % Mean along dimension 1 so a single-member cluster stays a row vector.
      c(k, :) = mean(X(members, :), 1);
    else
      % Emptiness is decided by the member count, never by the coordinate
      % sum: a non-empty cluster may legitimately sum to zero.
      c(k, :) = clusters(k, :);
    endif
  endfor
endfunction
| 65 | + |
% Replace every entry equal to zero with one; other entries are unchanged.
%
% Parameters:
%   mat - numeric array.
%
% Returns:
%   mat - same-sized array in which each zero entry has become 1.
function mat = maskZeroAsOne(mat)
  % The comparison yields 1 exactly where an entry is zero, so adding it
  % bumps only those entries.
  isZero = (mat == 0);
  mat = mat + isZero;
endfunction
| 70 | + |
% Squared Euclidean distance from every row of X to one centroid.
%
% Parameters:
%   X       - data matrix, one point per row.
%   cluster - centroid as a row vector.
%
% Returns:
%   d - size(X, 1)-by-1 column; d(r) is the sum of squared coordinate
%       differences between row r of X and cluster.
function d = euclideanDistanceSquare(X, cluster)
  pointCount = rows(X);
  % Stack one copy of the centroid per data point so the subtraction lines up.
  tiledCentroid = repmat(cluster, pointCount, 1);
  d = sum((X - tiledCentroid) .^ 2, 2);
endfunction
| 77 | + |
% Demo: cluster a small 2-D data set and plot the cost for each cluster count.
labels = 5;  % number of clusters requested below
data = [-10    1
         11    2
         45    3
          4    4
          7    5
        100  100
          5   -8
        -89   23];

% disp('K means clustering - single');
% disp(kMeansClustering(data, labels));

% Best of 10 runs for the requested number of clusters.
disp('multiple k means');
[kMeans, minCost] = multipleKMeans(data, labels, 10);
disp('min cost');
disp(minCost);
disp('k means');
disp(kMeans);

% Minimum cost for every cluster count from 1 to the number of data points.
plot(clusterVariationCosts(data));
0 commit comments