Finished SemiSupervised tests #175

dpuenteramirez · dpuenteramirez · commit 14c92d705df5 · 2022-04-16T16:39:03.000+02:00
diff --git a/semisupervised/CoTraining.py b/semisupervised/CoTraining.py
@@ -10,6 +10,7 @@
 import numpy as np
 from sklearn.naive_bayes import GaussianNB
 from sklearn.preprocessing import LabelEncoder
+
 from .utils import split
 
 
@@ -45,13 +46,10 @@ def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
         self.h1, self.h2 = configs
 
     def fit(self, samples, y):
-        labeled, u, y = split(samples, y)
-
-        if len(labeled) != len(y):
-            raise ValueError(
-                f'The dimension of the labeled data must be the same as the '
-                f'number of labels given. {len(labeled)} != {len(y)}'
-            )
+        try:
+            labeled, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
 
         le = LabelEncoder()
         le.fit(y)
@@ -61,8 +59,12 @@ def fit(self, samples, y):
         self.size_x1 = ceil(len(labeled[0]) / 2)
 
         rng = np.random.default_rng()
-        u_random_index = rng.choice(len(u), size=floor(self.u),
-                                    replace=False, shuffle=False)
+        try:
+            u_random_index = rng.choice(len(u), size=floor(self.u),
+                                        replace=False, shuffle=False)
+        except ValueError:
+            raise ValueError('The model was incorrectly parametrized, '
+                             'total between _p_ and _u_ is to big.')
 
         u_prime = u[u_random_index]
         u1, u2 = np.array_split(u_prime, 2, axis=1)
@@ -105,17 +107,16 @@ def fit(self, samples, y):
             u_prime = np.delete(u_prime, old_indexes, axis=0)
 
             u = np.delete(u, u_random_index, axis=0)
+
             try:
                 u_random_index = rng.choice(len(u),
                                             size=2 * self.p + 2 * self.n,
                                             replace=False, shuffle=False)
             except ValueError:
-                print('The model was incorrectly parametrized, k is to big.')
-            try:
-                u_prime = np.concatenate((u_prime, u[u_random_index]))
-            except IndexError:
-                print('The model was incorrectly parametrized, there are not '
-                      'enough unlabeled samples.')
+                raise ValueError('The model was incorrectly parametrized, '
+                                 'total between _p_ and _u_ is to big.')
+
+            u_prime = np.concatenate((u_prime, u[u_random_index]))
 
     def predict(self, samples):
         x1, x2 = np.array_split(samples, 2, axis=1)
diff --git a/semisupervised/DemocraticCoLearning.py b/semisupervised/DemocraticCoLearning.py
@@ -69,18 +69,15 @@ def __init__(self, random_state=None,
         self.h1, self.h2, self.h3 = configs
 
     def fit(self, samples, y):
-        labeled, u, y = split(samples, y)
-
-        if len(labeled) != len(y):
-            raise ValueError(
-                f'The dimension of the labeled data must be the same as the '
-                f'number of labels given. {len(labeled)} != {len(y)}'
-            )
+        try:
+            labeled, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
 
         le = LabelEncoder()
         le.fit(y)
         y = le.transform(y)
-        self.n_labels = max(np.unique(y))+1
+        self.n_labels = max(np.unique(y)) + 1
 
         unlabeled_data = u
         self.n_attributes = len(labeled[0])
diff --git a/semisupervised/DensityPeaks.py b/semisupervised/DensityPeaks.py
@@ -3,7 +3,7 @@
 # @Filename:    DensityPeaks.py
 # @Author:      Daniel Puente Ramírez
 # @Time:        5/3/22 09:55
-# @Version:     3.1
+# @Version:     4.0
 
 import math
 from collections import defaultdict
@@ -15,6 +15,9 @@
 from sklearn.semi_supervised import SelfTrainingClassifier
 from sklearn.svm import SVC
 
+from instance_selection import ENN
+from .utils import split
+
 
 class STDPNF:
     """
@@ -38,6 +41,8 @@ def __init__(self,
                  anormal=True,
                  filtering=False,
                  classifier=None,
+                 classifier_params=None,
+                 filter_method=None
                  ):
         """Semi Supervised Algorithm based on Density Peaks."""
         self.dc = dc
@@ -49,7 +54,19 @@ def __init__(self,
         self.distance_threshold = distance_threshold
         self.anormal = anormal
         self.filtering = filtering
-        self.classifier = classifier
+        if classifier is not None:
+            if isinstance(classifier_params, dict):
+                self.classifier = classifier(**classifier_params)
+            else:
+                self.classifier = classifier()
+        else:
+            self.classifier = None
+        if filter_method is not None and filter_method != 'ENANE':
+            self.filter = filter_method()
+        elif isinstance(filter_method, str) and filter_method == 'ENANE':
+            self.filter = filter_method
+        else:
+            self.filter = None
 
     def __build_distance(self):
         """
@@ -87,7 +104,7 @@ def __auto_select_dc(self):
         while True:
             nneighs = sum(
                 [1 for v in self.distances.values() if v < dc]) / self.n_id ** 2
-            if 0.01 <= nneighs <= 0.002:
+            if 0.01 <= nneighs <= 0.02:
                 break
             # binary search
             if nneighs < 0.01:
@@ -333,18 +350,46 @@ def _fit_stdpnf(self):
 
         while count <= max(self.order.values()):
             unlabeled_rows = self.structure_stdnpf.loc[self.structure_stdnpf[
-                                                           'label'] == -1].\
+                                                           'label'] == -1]. \
                 index.to_list()
             unlabeled_indexes = []
             for row in unlabeled_rows:
                 if self.order[row] == count:
                     unlabeled_indexes.append(row)
 
-            filtered_indexes, filtered_labels = self.__enane(
-                unlabeled_indexes, nan, lambda_param)
+            if isinstance(self.filter, str) and self.filter == 'ENANE':
+                filtered_indexes, filtered_labels = self.__enane(
+                    unlabeled_indexes, nan, lambda_param)
+                self.structure_stdnpf.at[filtered_indexes, 'label'] = \
+                    filtered_labels
 
-            self.structure_stdnpf.at[filtered_indexes, 'label'] = \
-                filtered_labels
+            else:
+                labeled_data = self.structure_stdnpf.loc[self.structure_stdnpf[
+                                                             'label'] != -1]
+                complete = labeled_data['sample']
+                complete_y = labeled_data['label']
+
+                if isinstance(self.filter, ENN):
+                    original = pd.DataFrame(self.l)
+                    original_y = pd.DataFrame(self.y)
+                    result, _ = self.filter.filter_original_complete(
+                        original, original_y, complete, complete_y)
+                else:
+                    result, _ = self.filter.filter(complete, complete_y)
+
+                results_to_unlabeled = []
+                for r in result.to_numpy():
+                    is_in = False
+                    for c in complete:
+                        if np.array_equal(r, c):
+                            is_in = True
+                    if not is_in:
+                        results_to_unlabeled.append(r)
+
+                for r in results_to_unlabeled:
+                    self.structure_stdnpf.at[
+                        np.array(self.structure_stdnpf['sample'],
+                                 r)]['label'] = -1
 
             labeled_data = self.structure_stdnpf.loc[self.structure_stdnpf[
                                                          'label'] != -1]
@@ -358,13 +403,12 @@ def _fit_stdpnf(self):
         self.classifier_stdpnf.fit(
             labeled_data['sample'].tolist(), labeled_data['label'].tolist())
 
-    def fit(self, l, u, y):
+    def fit(self, samples, y):
         """Fit method."""
-        if len(l) != len(y):
-            raise ValueError(
-                f'The dimension of the labeled data must be the same as the '
-                f'number of labels given. {len(l)} != {len(y)}'
-            )
+        try:
+            l, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
 
         le = LabelEncoder()
         le.fit(y)
diff --git a/semisupervised/TriTraining.py b/semisupervised/TriTraining.py
@@ -66,13 +66,10 @@ def subsample(self, l_t, s):
         return Bunch(data=samples, target=targets)
 
     def fit(self, samples, y):
-        labeled, u, y = split(samples, y)
-
-        if len(labeled) != len(y):
-            raise ValueError(
-                f'The dimension of the labeled data must be the same as the '
-                f'number of labels given. {len(labeled)} != {len(y)}'
-            )
+        try:
+            labeled, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
 
         le = LabelEncoder()
         le.fit(y)
diff --git a/semisupervised/__init__.py b/semisupervised/__init__.py
@@ -16,9 +16,9 @@
 __author__ = 'Daniel Puente Ramírez'
 
 from .CoTraining import CoTraining
-from .TriTraining import TriTraining
 from .DemocraticCoLearning import DemocraticCoLearning
 from .DensityPeaks import STDPNF
+from .TriTraining import TriTraining
 
 __all__ = ["CoTraining",
            "TriTraining",
diff --git a/semisupervised/utils/__init__.py b/semisupervised/utils/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "split"
-]
+]
diff --git a/tests/InstanceSelection.py b/tests/InstanceSelection.py
@@ -17,7 +17,6 @@
 def to_dataframe(y):
     if not isinstance(y, pd.DataFrame):
         return pd.DataFrame(y)
-    return y
 
 
 @pytest.fixture
diff --git a/tests/SemiSupervised.py b/tests/SemiSupervised.py
@@ -6,22 +6,18 @@
 
 import random
 
-import pytest
 import numpy as np
 import pandas as pd
+import pytest
 from sklearn.datasets import load_iris as load_digits
 from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
 
+from instance_selection import ENN
 from semisupervised import STDPNF, CoTraining, TriTraining, \
     DemocraticCoLearning
 
 
-def to_dataframe(y):
-    if not isinstance(y, pd.DataFrame):
-        return pd.DataFrame(y)
-    return y
-
-
 @pytest.fixture
 def digits_dataset_ss():
     x, y = load_digits(return_X_y=True, as_frame=True)
@@ -36,7 +32,7 @@ def digits_dataset_ss():
     y_train = pd.DataFrame(y_train)
     y_test = pd.DataFrame(y_test)
     li = list(set(range(x_train.shape[0])))
-    unlabeled = random.sample(li, int(x_train.shape[0] * 0.3))
+    unlabeled = random.sample(li, int(x_train.shape[0] * 0.55))
     y_train.loc[unlabeled] = -1
 
     return x_train, x_test, y_train, y_test, opt_labels
@@ -57,19 +53,68 @@ def base(x_train, x_test, y_train, y_test, opt_labels, algorithm, params=None):
 
 def test_co_training(digits_dataset_ss):
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
-    base(x_train, x_test, y_train, y_test, opt_labels, CoTraining)
+    base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
+         {'p': 1, 'n': 3, 'k': 1, 'u': 7})
+    base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
+         {'p': 1, 'n': 3, 'k': 1, 'u': 7,
+          'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
+          'c2': KNeighborsClassifier})
+
+    with pytest.raises(ValueError):
+        base(x_train, x_test, y_train, y_test, opt_labels, CoTraining)
+
+    with pytest.raises(ValueError):
+        base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
+             {'p': 1, 'n': 3, 'k': 100, 'u': 7})
+
+    with pytest.raises(ValueError):
+        base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
+             {'p': 5, 'n': 5, 'k': 100, 'u': 15})
 
 
 def test_tri_training(digits_dataset_ss):
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
-    base(x_train, x_test, y_train, y_test, opt_labels, TriTraining)
+    base(x_train, x_test, y_train, y_test, opt_labels, TriTraining,
+         {'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
+          'c2': KNeighborsClassifier})
 
 
 def test_demo_co_learning(digits_dataset_ss):
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning)
+    base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning,
+         {'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
+          'c2': KNeighborsClassifier})
 
 
 def test_density_peaks(digits_dataset_ss):
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, STDPNF)
+
+
+def test_density_peaks_filtering(digits_dataset_ss):
+    x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
+    with pytest.raises(AttributeError):
+        base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
+             {'filtering': True})
+    base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
+         {'filtering': True, 'filter_method': 'ENANE'})
+
+    base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
+         {'filtering': True, 'filter_method': ENN, 'dc': 'auto',
+          'classifier': KNeighborsClassifier})
+
+
+def test_different_len(digits_dataset_ss):
+    x, _, y, _, _ = digits_dataset_ss
+    co = CoTraining()
+    tri = TriTraining()
+    demo_co = DemocraticCoLearning()
+    stdpnf = STDPNF()
+
+    models = [co, tri, demo_co, stdpnf]
+    y = y[:-1]
+
+    for model in models:
+        with pytest.raises(ValueError):
+            model.fit(x, y)

-Original file line number
+Diff line change
 __all__ = [
     "split"
 -]
 +]