Skip to content

Commit 14c92d7

Browse files
Finished SemiSupervised tests #175
1 parent 936f81f commit 14c92d7

8 files changed

+140 additions, -57 deletions (lines changed)

semisupervised/CoTraining.py

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import numpy as np
1111
from sklearn.naive_bayes import GaussianNB
1212
from sklearn.preprocessing import LabelEncoder
13+
1314
from .utils import split
1415

1516

@@ -45,13 +46,10 @@ def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
4546
self.h1, self.h2 = configs
4647

4748
def fit(self, samples, y):
48-
labeled, u, y = split(samples, y)
49-
50-
if len(labeled) != len(y):
51-
raise ValueError(
52-
f'The dimension of the labeled data must be the same as the '
53-
f'number of labels given. {len(labeled)} != {len(y)}'
54-
)
49+
try:
50+
labeled, u, y = split(samples, y)
51+
except IndexError:
52+
raise ValueError('Dimensions do not match.')
5553

5654
le = LabelEncoder()
5755
le.fit(y)
@@ -61,8 +59,12 @@ def fit(self, samples, y):
6159
self.size_x1 = ceil(len(labeled[0]) / 2)
6260

6361
rng = np.random.default_rng()
64-
u_random_index = rng.choice(len(u), size=floor(self.u),
65-
replace=False, shuffle=False)
62+
try:
63+
u_random_index = rng.choice(len(u), size=floor(self.u),
64+
replace=False, shuffle=False)
65+
except ValueError:
66+
raise ValueError('The model was incorrectly parametrized, '
67+
'total between _p_ and _u_ is to big.')
6668

6769
u_prime = u[u_random_index]
6870
u1, u2 = np.array_split(u_prime, 2, axis=1)
@@ -105,17 +107,16 @@ def fit(self, samples, y):
105107
u_prime = np.delete(u_prime, old_indexes, axis=0)
106108

107109
u = np.delete(u, u_random_index, axis=0)
110+
108111
try:
109112
u_random_index = rng.choice(len(u),
110113
size=2 * self.p + 2 * self.n,
111114
replace=False, shuffle=False)
112115
except ValueError:
113-
print('The model was incorrectly parametrized, k is to big.')
114-
try:
115-
u_prime = np.concatenate((u_prime, u[u_random_index]))
116-
except IndexError:
117-
print('The model was incorrectly parametrized, there are not '
118-
'enough unlabeled samples.')
116+
raise ValueError('The model was incorrectly parametrized, '
117+
'total between _p_ and _u_ is to big.')
118+
119+
u_prime = np.concatenate((u_prime, u[u_random_index]))
119120

120121
def predict(self, samples):
121122
x1, x2 = np.array_split(samples, 2, axis=1)

semisupervised/DemocraticCoLearning.py

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -69,18 +69,15 @@ def __init__(self, random_state=None,
6969
self.h1, self.h2, self.h3 = configs
7070

7171
def fit(self, samples, y):
72-
labeled, u, y = split(samples, y)
73-
74-
if len(labeled) != len(y):
75-
raise ValueError(
76-
f'The dimension of the labeled data must be the same as the '
77-
f'number of labels given. {len(labeled)} != {len(y)}'
78-
)
72+
try:
73+
labeled, u, y = split(samples, y)
74+
except IndexError:
75+
raise ValueError('Dimensions do not match.')
7976

8077
le = LabelEncoder()
8178
le.fit(y)
8279
y = le.transform(y)
83-
self.n_labels = max(np.unique(y))+1
80+
self.n_labels = max(np.unique(y)) + 1
8481

8582
unlabeled_data = u
8683
self.n_attributes = len(labeled[0])

semisupervised/DensityPeaks.py

Lines changed: 58 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
# @Filename: DensityPeaks.py
44
# @Author: Daniel Puente Ramírez
55
# @Time: 5/3/22 09:55
6-
# @Version: 3.1
6+
# @Version: 4.0
77

88
import math
99
from collections import defaultdict
@@ -15,6 +15,9 @@
1515
from sklearn.semi_supervised import SelfTrainingClassifier
1616
from sklearn.svm import SVC
1717

18+
from instance_selection import ENN
19+
from .utils import split
20+
1821

1922
class STDPNF:
2023
"""
@@ -38,6 +41,8 @@ def __init__(self,
3841
anormal=True,
3942
filtering=False,
4043
classifier=None,
44+
classifier_params=None,
45+
filter_method=None
4146
):
4247
"""Semi Supervised Algorithm based on Density Peaks."""
4348
self.dc = dc
@@ -49,7 +54,19 @@ def __init__(self,
4954
self.distance_threshold = distance_threshold
5055
self.anormal = anormal
5156
self.filtering = filtering
52-
self.classifier = classifier
57+
if classifier is not None:
58+
if isinstance(classifier_params, dict):
59+
self.classifier = classifier(**classifier_params)
60+
else:
61+
self.classifier = classifier()
62+
else:
63+
self.classifier = None
64+
if filter_method is not None and filter_method != 'ENANE':
65+
self.filter = filter_method()
66+
elif isinstance(filter_method, str) and filter_method == 'ENANE':
67+
self.filter = filter_method
68+
else:
69+
self.filter = None
5370

5471
def __build_distance(self):
5572
"""
@@ -87,7 +104,7 @@ def __auto_select_dc(self):
87104
while True:
88105
nneighs = sum(
89106
[1 for v in self.distances.values() if v < dc]) / self.n_id ** 2
90-
if 0.01 <= nneighs <= 0.002:
107+
if 0.01 <= nneighs <= 0.02:
91108
break
92109
# binary search
93110
if nneighs < 0.01:
@@ -333,18 +350,46 @@ def _fit_stdpnf(self):
333350

334351
while count <= max(self.order.values()):
335352
unlabeled_rows = self.structure_stdnpf.loc[self.structure_stdnpf[
336-
'label'] == -1].\
353+
'label'] == -1]. \
337354
index.to_list()
338355
unlabeled_indexes = []
339356
for row in unlabeled_rows:
340357
if self.order[row] == count:
341358
unlabeled_indexes.append(row)
342359

343-
filtered_indexes, filtered_labels = self.__enane(
344-
unlabeled_indexes, nan, lambda_param)
360+
if isinstance(self.filter, str) and self.filter == 'ENANE':
361+
filtered_indexes, filtered_labels = self.__enane(
362+
unlabeled_indexes, nan, lambda_param)
363+
self.structure_stdnpf.at[filtered_indexes, 'label'] = \
364+
filtered_labels
345365

346-
self.structure_stdnpf.at[filtered_indexes, 'label'] = \
347-
filtered_labels
366+
else:
367+
labeled_data = self.structure_stdnpf.loc[self.structure_stdnpf[
368+
'label'] != -1]
369+
complete = labeled_data['sample']
370+
complete_y = labeled_data['label']
371+
372+
if isinstance(self.filter, ENN):
373+
original = pd.DataFrame(self.l)
374+
original_y = pd.DataFrame(self.y)
375+
result, _ = self.filter.filter_original_complete(
376+
original, original_y, complete, complete_y)
377+
else:
378+
result, _ = self.filter.filter(complete, complete_y)
379+
380+
results_to_unlabeled = []
381+
for r in result.to_numpy():
382+
is_in = False
383+
for c in complete:
384+
if np.array_equal(r, c):
385+
is_in = True
386+
if not is_in:
387+
results_to_unlabeled.append(r)
388+
389+
for r in results_to_unlabeled:
390+
self.structure_stdnpf.at[
391+
np.array(self.structure_stdnpf['sample'],
392+
r)]['label'] = -1
348393

349394
labeled_data = self.structure_stdnpf.loc[self.structure_stdnpf[
350395
'label'] != -1]
@@ -358,13 +403,12 @@ def _fit_stdpnf(self):
358403
self.classifier_stdpnf.fit(
359404
labeled_data['sample'].tolist(), labeled_data['label'].tolist())
360405

361-
def fit(self, l, u, y):
406+
def fit(self, samples, y):
362407
"""Fit method."""
363-
if len(l) != len(y):
364-
raise ValueError(
365-
f'The dimension of the labeled data must be the same as the '
366-
f'number of labels given. {len(l)} != {len(y)}'
367-
)
408+
try:
409+
l, u, y = split(samples, y)
410+
except IndexError:
411+
raise ValueError('Dimensions do not match.')
368412

369413
le = LabelEncoder()
370414
le.fit(y)

semisupervised/TriTraining.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -66,13 +66,10 @@ def subsample(self, l_t, s):
6666
return Bunch(data=samples, target=targets)
6767

6868
def fit(self, samples, y):
69-
labeled, u, y = split(samples, y)
70-
71-
if len(labeled) != len(y):
72-
raise ValueError(
73-
f'The dimension of the labeled data must be the same as the '
74-
f'number of labels given. {len(labeled)} != {len(y)}'
75-
)
69+
try:
70+
labeled, u, y = split(samples, y)
71+
except IndexError:
72+
raise ValueError('Dimensions do not match.')
7673

7774
le = LabelEncoder()
7875
le.fit(y)

semisupervised/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,9 @@
1616
__author__ = 'Daniel Puente Ramírez'
1717

1818
from .CoTraining import CoTraining
19-
from .TriTraining import TriTraining
2019
from .DemocraticCoLearning import DemocraticCoLearning
2120
from .DensityPeaks import STDPNF
21+
from .TriTraining import TriTraining
2222

2323
__all__ = ["CoTraining",
2424
"TriTraining",

semisupervised/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,4 @@
22

33
__all__ = [
44
"split"
5-
]
5+
]

tests/InstanceSelection.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@
1717
def to_dataframe(y):
1818
if not isinstance(y, pd.DataFrame):
1919
return pd.DataFrame(y)
20-
return y
2120

2221

2322
@pytest.fixture

tests/SemiSupervised.py

Lines changed: 55 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,22 +6,18 @@
66

77
import random
88

9-
import pytest
109
import numpy as np
1110
import pandas as pd
11+
import pytest
1212
from sklearn.datasets import load_iris as load_digits
1313
from sklearn.model_selection import train_test_split
14+
from sklearn.neighbors import KNeighborsClassifier
1415

16+
from instance_selection import ENN
1517
from semisupervised import STDPNF, CoTraining, TriTraining, \
1618
DemocraticCoLearning
1719

1820

19-
def to_dataframe(y):
20-
if not isinstance(y, pd.DataFrame):
21-
return pd.DataFrame(y)
22-
return y
23-
24-
2521
@pytest.fixture
2622
def digits_dataset_ss():
2723
x, y = load_digits(return_X_y=True, as_frame=True)
@@ -36,7 +32,7 @@ def digits_dataset_ss():
3632
y_train = pd.DataFrame(y_train)
3733
y_test = pd.DataFrame(y_test)
3834
li = list(set(range(x_train.shape[0])))
39-
unlabeled = random.sample(li, int(x_train.shape[0] * 0.3))
35+
unlabeled = random.sample(li, int(x_train.shape[0] * 0.55))
4036
y_train.loc[unlabeled] = -1
4137

4238
return x_train, x_test, y_train, y_test, opt_labels
@@ -57,19 +53,68 @@ def base(x_train, x_test, y_train, y_test, opt_labels, algorithm, params=None):
5753

5854
def test_co_training(digits_dataset_ss):
5955
x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
60-
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining)
56+
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
57+
{'p': 1, 'n': 3, 'k': 1, 'u': 7})
58+
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
59+
{'p': 1, 'n': 3, 'k': 1, 'u': 7,
60+
'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
61+
'c2': KNeighborsClassifier})
62+
63+
with pytest.raises(ValueError):
64+
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining)
65+
66+
with pytest.raises(ValueError):
67+
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
68+
{'p': 1, 'n': 3, 'k': 100, 'u': 7})
69+
70+
with pytest.raises(ValueError):
71+
base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
72+
{'p': 5, 'n': 5, 'k': 100, 'u': 15})
6173

6274

6375
def test_tri_training(digits_dataset_ss):
6476
x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
65-
base(x_train, x_test, y_train, y_test, opt_labels, TriTraining)
77+
base(x_train, x_test, y_train, y_test, opt_labels, TriTraining,
78+
{'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
79+
'c2': KNeighborsClassifier})
6680

6781

6882
def test_demo_co_learning(digits_dataset_ss):
6983
x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
7084
base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning)
85+
base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning,
86+
{'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
87+
'c2': KNeighborsClassifier})
7188

7289

7390
def test_density_peaks(digits_dataset_ss):
7491
x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
7592
base(x_train, x_test, y_train, y_test, opt_labels, STDPNF)
93+
94+
95+
def test_density_peaks_filtering(digits_dataset_ss):
96+
x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
97+
with pytest.raises(AttributeError):
98+
base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
99+
{'filtering': True})
100+
base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
101+
{'filtering': True, 'filter_method': 'ENANE'})
102+
103+
base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
104+
{'filtering': True, 'filter_method': ENN, 'dc': 'auto',
105+
'classifier': KNeighborsClassifier})
106+
107+
108+
def test_different_len(digits_dataset_ss):
109+
x, _, y, _, _ = digits_dataset_ss
110+
co = CoTraining()
111+
tri = TriTraining()
112+
demo_co = DemocraticCoLearning()
113+
stdpnf = STDPNF()
114+
115+
models = [co, tri, demo_co, stdpnf]
116+
y = y[:-1]
117+
118+
for model in models:
119+
with pytest.raises(ValueError):
120+
model.fit(x, y)

0 commit comments

Comments
 (0)