kNN Ensemble
In [ ]:
Copied!
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys
# Colab
try:
import google.colab
IN_COLAB = True
except:
IN_COLAB = False
# Plotting
import matplotlib.pylab as plt
# %matplotlib inline
from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display
# Progress
from tqdm import tqdm
################################################################
# Configure system environment
# - Please modify input_dir according to your local enviornment
#
################################################################
cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB:
# Run this demo on Google Colab
from google.colab import drive
drive.mount('/content/drive')
# Parameters for data
input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
# /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/data/data-is-life
sys.path.append(input_dir)
else:
input_dir = cur_dir
if input_dir != cur_dir:
sys.path.append(input_dir)
print(f"> Adding {input_dir} to sys path ...")
print(sys.path)
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys
# Colab
try:
import google.colab
IN_COLAB = True
except:
IN_COLAB = False
# Plotting
import matplotlib.pylab as plt
# %matplotlib inline
from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display
# Progress
from tqdm import tqdm
################################################################
# Configure system environment
# - Please modify input_dir according to your local enviornment
#
################################################################
cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB:
# Run this demo on Google Colab
from google.colab import drive
drive.mount('/content/drive')
# Parameters for data
input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
# /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/data/data-is-life
sys.path.append(input_dir)
else:
input_dir = cur_dir
if input_dir != cur_dir:
sys.path.append(input_dir)
print(f"> Adding {input_dir} to sys path ...")
print(sys.path)
Mounted at /content/drive > Adding /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble to sys path ... ['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble']
In [ ]:
Copied!
# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras
# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################
# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################
# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc
import polarity_models as pmodel
from polarity_models import Polarity
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################
# Misc
import pprint
import tempfile
from typing import Dict, Text
np.set_printoptions(precision=3, edgeitems=5, suppress=True)
# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras
# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################
# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################
# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc
import polarity_models as pmodel
from polarity_models import Polarity
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################
# Misc
import pprint
import tempfile
from typing import Dict, Text
np.set_printoptions(precision=3, edgeitems=5, suppress=True)
2.8.0
Generating training data¶
In [ ]:
Copied!
# %matplotlib inline
import data_pipeline as dp
max_class_ratio=0.99
# get the dataset
X0, y0 = dp.generate_imbalanced_data(class_ratio=max_class_ratio, verbose=1)
# %matplotlib inline
import data_pipeline as dp
max_class_ratio=0.99
# get the dataset
X0, y0 = dp.generate_imbalanced_data(class_ratio=max_class_ratio, verbose=1)
> n_classes: 2
[0 1]
> counts:
Counter({0: 4465, 1: 535})
Choosing base classifiers¶
In [ ]:
Copied!
# Create Base Learners
base_learners = [
('RF', RandomForestClassifier(n_estimators= 200,
oob_score = True,
class_weight = "balanced",
random_state = 20,
ccp_alpha = 0.1)),
('KNNC', KNeighborsClassifier(n_neighbors = len(np.unique(y0))
, weights = 'distance')),
# ('SVC', SVC(kernel = 'linear', probability=True,
# class_weight = 'balanced'
# , break_ties = True)),
('GNB', GaussianNB()),
('QDA', QuadraticDiscriminantAnalysis()),
('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
# ('DT', DecisionTreeClassifier(max_depth=5)),
# ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))),
]
# Create Base Learners
base_learners = [
('RF', RandomForestClassifier(n_estimators= 200,
oob_score = True,
class_weight = "balanced",
random_state = 20,
ccp_alpha = 0.1)),
('KNNC', KNeighborsClassifier(n_neighbors = len(np.unique(y0))
, weights = 'distance')),
# ('SVC', SVC(kernel = 'linear', probability=True,
# class_weight = 'balanced'
# , break_ties = True)),
('GNB', GaussianNB()),
('QDA', QuadraticDiscriminantAnalysis()),
('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
# ('DT', DecisionTreeClassifier(max_depth=5)),
# ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))),
]
Load pre-trained level-1 data¶
- If it's unclear how to obtain the pre-trained dataset (e.g. probability matrices from base classifiers), please refer back to part 1 or part 2 of this demo series.
In [ ]:
Copied!
import cf_models as cm
tLoadPretrained = False
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################
if not tLoadPretrained:
# Use the previously selected base predictors (`base_learners`) to generate the level-1 dataset
R, T, U, L_train, L_test = cm.demo_cf_stacking(input_data=(X0, y0),
input_dir=input_dir, n_iter=n_iterations,
base_learners=base_learners, # <<< base classifiers selected
verbose=1)
else:
R, T, U, L_train, L_test = dp.load_pretrained_level1_data(fold_number=fold_number, verbose=1, data_dir=data_dir)
# Derived quantities
n_train = R.shape[1]
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
lh = uc.estimateLabels(T, p_th=p_threshold) # We cannot use L_test (cheating), but we have to guesstimate
L = np.hstack((L_train, lh))
X = np.hstack((R, T))
assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")
import cf_models as cm
tLoadPretrained = False
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################
if not tLoadPretrained:
# Use the previously selected base predictors (`base_learners`) to generate the level-1 dataset
R, T, U, L_train, L_test = cm.demo_cf_stacking(input_data=(X0, y0),
input_dir=input_dir, n_iter=n_iterations,
base_learners=base_learners, # <<< base classifiers selected
verbose=1)
else:
R, T, U, L_train, L_test = dp.load_pretrained_level1_data(fold_number=fold_number, verbose=1, data_dir=data_dir)
# Derived quantities
n_train = R.shape[1]
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
lh = uc.estimateLabels(T, p_th=p_threshold) # We cannot use L_test (cheating), but we have to guesstimate
L = np.hstack((L_train, lh))
X = np.hstack((R, T))
assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")
2.8.0
0%| | 0/1 [00:00<?, ?it/s]
(BaseCF) base est | name: RF, estimator: RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced', n_estimators=200,
oob_score=True, random_state=20)
(BaseCF) base est | name: KNNC, estimator: KNeighborsClassifier(n_neighbors=2, weights='distance')
(BaseCF) base est | name: GNB, estimator: GaussianNB()
(BaseCF) base est | name: QDA, estimator: QuadraticDiscriminantAnalysis()
(BaseCF) base est | name: MLPClassifier, estimator: MLPClassifier(alpha=1, max_iter=1000)
(BaseCF) Base predictors:
[1] RF: RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced', n_estimators=200,
oob_score=True, random_state=20)
[2] QDA: QuadraticDiscriminantAnalysis()
[3] MLPClassifier: MLPClassifier(alpha=1, max_iter=1000)
[4] KNNC: KNeighborsClassifier(n_neighbors=2, weights='distance')
[5] GNB: GaussianNB()
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 26.2s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.3s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.2s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 34.1s finished
[info] Saving X_meta (shape=(3750, 5)) at: /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/train-0.npz [info] Saving X_meta (shape=(1250, 5)) at: /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz [info] Saving X_meta (shape=(1250, 5)) at: /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz [result] 0.11188811188811189 (cf_write) Adding new attribute y: [0 0 0 0 0 ... 0 1 0 0 0] ... (cf_write) Saving X_meta at: /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz
100%|██████████| 1/1 [01:24<00:00, 84.80s/it]
[info] list of base classifiers: ['RF' 'KNNC' 'GNB' 'QDA' 'MLPClassifier'] ================================================================================ R: Rating/probability matrix for the TRAIN set ================================================================================ > shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000)
Confidence matrices¶
In [ ]:
Copied!
# import utils_cf as uc
# import polarity_models as pmodel
n_factors = 100
alpha = 100.0
conf_measure = 'brier' # Options: 'brier', 'uniform'
policy_threshold = 'fmax'
Pc, C0, Cw, Cn, *rest = \
uc.evalConfidenceMatrices(R, L_train, alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure, policy_threshold=policy_threshold,
# Optional debug/test parameters
U=U, n_train=n_train, fold_number=fold_number,
is_cascade=True,
verbose=0)
assert C0.shape == R.shape
y_colors = pmodel.verify_colors(Pc) # [log] status: ok
# import utils_cf as uc
# import polarity_models as pmodel
n_factors = 100
alpha = 100.0
conf_measure = 'brier' # Options: 'brier', 'uniform'
policy_threshold = 'fmax'
Pc, C0, Cw, Cn, *rest = \
uc.evalConfidenceMatrices(R, L_train, alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure, policy_threshold=policy_threshold,
# Optional debug/test parameters
U=U, n_train=n_train, fold_number=fold_number,
is_cascade=True,
verbose=0)
assert C0.shape == R.shape
y_colors = pmodel.verify_colors(Pc) # [log] status: ok
(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...
Training CFNet¶
In [ ]:
Copied!
import cf_models as cm
n_users, n_items = R.shape
fold_number = 0
test_size = 0.1
policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100
lr = 0.001
batch_size = 64
epochs = 200
loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...
# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix
#
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label
cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn),
input_data=(R, T, U, L_train, L_test),
# Should we combine R and T into a single matrix X? Set to True if so
is_cascade = False, # Set to False here because we attempt to re-estimate T using kNNs
# lh = lh, # Estimated labels by default are the majority vote
# SGD optimization parameters
test_size = test_size,
epochs = epochs,
batch_size=batch_size,
# CF hyperparameters
# n_factors=n_factors, # this is factored into model definition
alpha=alpha,
conf_measure=conf_measure,
# conf_type='Cn', # default sparse confidence matrix (Cn)
# target_type=target_type,
policy_threshold=policy_threshold,
fold_number=fold_number)
import cf_models as cm
n_users, n_items = R.shape
fold_number = 0
test_size = 0.1
policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100
lr = 0.001
batch_size = 64
epochs = 200
loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...
# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix
#
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label
cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn),
input_data=(R, T, U, L_train, L_test),
# Should we combine R and T into a single matrix X? Set to True if so
is_cascade = False, # Set to False here because we attempt to re-estimate T using kNNs
# lh = lh, # Estimated labels by default are the majority vote
# SGD optimization parameters
test_size = test_size,
epochs = epochs,
batch_size=batch_size,
# CF hyperparameters
# n_factors=n_factors, # this is factored into model definition
alpha=alpha,
conf_measure=conf_measure,
# conf_type='Cn', # default sparse confidence matrix (Cn)
# target_type=target_type,
policy_threshold=policy_threshold,
fold_number=fold_number)
(make_cn) Using WEIGHTED confidence matrix to approximate ratings ... [info] Confidence matrix type: Cn, target data type: label Epoch 1/200 264/264 [==============================] - 4s 9ms/step - loss: 3.2968 - val_loss: 3.4268 Epoch 2/200 264/264 [==============================] - 1s 5ms/step - loss: 3.8964 - val_loss: 2.7115 Epoch 3/200 264/264 [==============================] - 1s 5ms/step - loss: 5.7728 - val_loss: 4.9412 Epoch 4/200 264/264 [==============================] - 1s 6ms/step - loss: 2.4199 - val_loss: 1.9336 Epoch 5/200 264/264 [==============================] - 1s 5ms/step - loss: 1.2328 - val_loss: 1.7244 Epoch 6/200 264/264 [==============================] - 2s 6ms/step - loss: 0.9934 - val_loss: 1.5784 Epoch 7/200 264/264 [==============================] - 1s 5ms/step - loss: 0.8551 - val_loss: 1.4817 Epoch 8/200 264/264 [==============================] - 1s 6ms/step - loss: 0.7566 - val_loss: 1.4098 Epoch 9/200 264/264 [==============================] - 1s 6ms/step - loss: 0.6829 - val_loss: 1.3618 Epoch 10/200 264/264 [==============================] - 1s 6ms/step - loss: 0.6331 - val_loss: 1.3251 Epoch 11/200 264/264 [==============================] - 1s 5ms/step - loss: 0.5965 - val_loss: 1.3035 Epoch 12/200 264/264 [==============================] - 1s 5ms/step - loss: 0.5663 - val_loss: 1.2753 Epoch 13/200 264/264 [==============================] - 1s 5ms/step - loss: 0.5445 - val_loss: 1.2593 Epoch 14/200 264/264 [==============================] - 2s 7ms/step - loss: 0.5252 - val_loss: 1.2421 Epoch 15/200 264/264 [==============================] - 1s 5ms/step - loss: 0.5111 - val_loss: 1.2230 Epoch 16/200 264/264 [==============================] - 1s 6ms/step - loss: 0.4965 - val_loss: 1.2075 Epoch 17/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4867 - val_loss: 1.2036 Epoch 18/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4735 - val_loss: 1.1880 Epoch 19/200 264/264 [==============================] - 1s 6ms/step - loss: 0.4693 - val_loss: 1.1869 Epoch 20/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4613 - val_loss: 1.1713 Epoch 21/200 264/264 [==============================] - 1s 6ms/step - loss: 0.4513 - val_loss: 1.1555 Epoch 22/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4406 - val_loss: 1.1405 Epoch 23/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4341 - val_loss: 1.1389 Epoch 24/200 264/264 [==============================] - 2s 6ms/step - loss: 0.4273 - val_loss: 1.1255 Epoch 25/200 264/264 [==============================] - 1s 6ms/step - loss: 0.4190 - val_loss: 1.1011 Epoch 26/200 264/264 [==============================] - 1s 5ms/step - loss: 0.4070 - val_loss: 1.0893 Epoch 27/200 264/264 [==============================] - 2s 6ms/step - loss: 0.3966 - val_loss: 1.0736 Epoch 28/200 264/264 [==============================] - 1s 5ms/step - loss: 0.3893 - val_loss: 1.0563 Epoch 29/200 264/264 [==============================] - 1s 5ms/step - loss: 0.3797 - val_loss: 1.0431 Epoch 30/200 264/264 [==============================] - 1s 6ms/step - loss: 0.3710 - val_loss: 1.0281 Epoch 31/200 264/264 [==============================] - 1s 5ms/step - loss: 0.3614 - val_loss: 1.0096 Epoch 32/200 264/264 [==============================] - 1s 5ms/step - loss: 0.3513 - val_loss: 0.9945 Epoch 33/200 264/264 [==============================] - 1s 5ms/step - loss: 0.3432 - val_loss: 0.9771 Epoch 34/200 264/264 [==============================] - 2s 6ms/step - loss: 0.3340 - val_loss: 0.9605 Epoch 35/200 264/264 [==============================] - 1s 6ms/step - loss: 0.3268 - val_loss: 0.9467 Epoch 36/200 264/264 [==============================] - 1s 6ms/step - loss: 0.3174 - val_loss: 0.9342 Epoch 37/200 264/264 [==============================] - 1s 6ms/step - loss: 0.3106 - val_loss: 0.9178 Epoch 38/200 264/264 [==============================] - 1s 6ms/step - loss: 0.3022 - val_loss: 0.9035 Epoch 39/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2951 - val_loss: 0.8903 Epoch 40/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2886 - val_loss: 0.8780 Epoch 41/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2827 - val_loss: 0.8666 Epoch 42/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2776 - val_loss: 0.8562 Epoch 43/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2729 - val_loss: 0.8460 Epoch 44/200 264/264 [==============================] - 2s 6ms/step - loss: 0.2684 - val_loss: 0.8362 Epoch 45/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2641 - val_loss: 0.8269 Epoch 46/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2600 - val_loss: 0.8175 Epoch 47/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2561 - val_loss: 0.8085 Epoch 48/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2521 - val_loss: 0.7994 Epoch 49/200 264/264 [==============================] - 2s 6ms/step - loss: 0.2483 - val_loss: 0.7906 Epoch 50/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2445 - val_loss: 0.7816 Epoch 51/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2411 - val_loss: 0.7730 Epoch 52/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2381 - val_loss: 0.7659 Epoch 53/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2351 - val_loss: 0.7571 Epoch 54/200 264/264 [==============================] - 2s 6ms/step - loss: 0.2312 - val_loss: 0.7480 Epoch 55/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2277 - val_loss: 0.7407 Epoch 56/200 264/264 [==============================] - 2s 6ms/step - loss: 0.2245 - val_loss: 0.7316 Epoch 57/200 264/264 [==============================] - 1s 6ms/step - loss: 0.2211 - val_loss: 0.7225 Epoch 58/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2178 - val_loss: 0.7144 Epoch 59/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2143 - val_loss: 0.7066 Epoch 60/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2123 - val_loss: 0.6985 Epoch 61/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2094 - val_loss: 0.6908 Epoch 62/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2056 - val_loss: 0.6835 Epoch 63/200 264/264 [==============================] - 1s 5ms/step - loss: 0.2022 - val_loss: 0.6750 Epoch 64/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1986 - val_loss: 0.6671 Epoch 65/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1957 - val_loss: 0.6597 Epoch 66/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1932 - val_loss: 0.6523 Epoch 67/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1910 - val_loss: 0.6444 Epoch 68/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1877 - val_loss: 0.6385 Epoch 69/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1856 - val_loss: 0.6321 Epoch 70/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1831 - val_loss: 0.6228 Epoch 71/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1787 - val_loss: 0.6150 Epoch 72/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1758 - val_loss: 0.6076 Epoch 73/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1737 - val_loss: 0.6008 Epoch 74/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1708 - val_loss: 0.5942 Epoch 75/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1683 - val_loss: 0.5876 Epoch 76/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1667 - val_loss: 0.5808 Epoch 77/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1649 - val_loss: 0.5745 Epoch 78/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1613 - val_loss: 0.5684 Epoch 79/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1590 - val_loss: 0.5604 Epoch 80/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1558 - val_loss: 0.5538 Epoch 81/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1535 - val_loss: 0.5473 Epoch 82/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1511 - val_loss: 0.5413 Epoch 83/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1495 - val_loss: 0.5360 Epoch 84/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1483 - val_loss: 0.5326 Epoch 85/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1455 - val_loss: 0.5226 Epoch 86/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1425 - val_loss: 0.5167 Epoch 87/200 264/264 [==============================] - 2s 7ms/step - loss: 0.1400 - val_loss: 0.5108 Epoch 88/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1379 - val_loss: 0.5049 Epoch 89/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1358 - val_loss: 0.4988 Epoch 90/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1339 - val_loss: 0.4936 Epoch 91/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1321 - val_loss: 0.4885 Epoch 92/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1303 - val_loss: 0.4847 Epoch 93/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1287 - val_loss: 0.4774 Epoch 94/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1259 - val_loss: 0.4708 Epoch 95/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1237 - val_loss: 0.4653 Epoch 96/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1217 - val_loss: 0.4599 Epoch 97/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1198 - val_loss: 0.4544 Epoch 98/200 264/264 [==============================] - 2s 7ms/step - loss: 0.1181 - val_loss: 0.4495 Epoch 99/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1163 - val_loss: 0.4448 Epoch 100/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1153 - val_loss: 0.4394 Epoch 101/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1133 - val_loss: 0.4342 Epoch 102/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1110 - val_loss: 0.4289 Epoch 103/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1098 - val_loss: 0.4247 Epoch 104/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1076 - val_loss: 0.4193 Epoch 105/200 264/264 [==============================] - 2s 6ms/step - loss: 0.1059 - val_loss: 0.4143 Epoch 106/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1042 - val_loss: 0.4098 Epoch 107/200 264/264 [==============================] - 1s 5ms/step - loss: 0.1027 - val_loss: 0.4054 Epoch 108/200 264/264 [==============================] - 1s 6ms/step - loss: 0.1012 - val_loss: 0.4012 Epoch 109/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0997 - val_loss: 0.3959 Epoch 110/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0982 - val_loss: 0.3916 Epoch 111/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0965 - val_loss: 0.3869 Epoch 112/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0949 - val_loss: 0.3826 Epoch 113/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0935 - val_loss: 0.3780 Epoch 114/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0919 - val_loss: 0.3739 Epoch 115/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0908 - val_loss: 0.3703 Epoch 116/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0894 - val_loss: 0.3657 Epoch 117/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0882 - val_loss: 0.3614 Epoch 118/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0862 - val_loss: 0.3571 Epoch 119/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0849 - val_loss: 0.3534 Epoch 120/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0836 - val_loss: 0.3494 Epoch 121/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0823 - val_loss: 0.3454 Epoch 122/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0812 - val_loss: 0.3421 Epoch 123/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0800 - val_loss: 0.3379 Epoch 124/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0786 - val_loss: 0.3340 Epoch 125/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0775 - val_loss: 0.3302 Epoch 126/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0760 - val_loss: 0.3268 Epoch 127/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0748 - val_loss: 0.3231 Epoch 128/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0736 - val_loss: 0.3194 Epoch 129/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0726 - val_loss: 0.3159 Epoch 130/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0715 - val_loss: 0.3126 Epoch 131/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0703 - val_loss: 0.3090 Epoch 132/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0691 - val_loss: 0.3056 Epoch 133/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0681 - val_loss: 0.3019 Epoch 134/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0670 - val_loss: 0.2991 Epoch 135/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0659 - val_loss: 0.2950 Epoch 136/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0650 - val_loss: 0.2924 Epoch 137/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0638 - val_loss: 0.2891 Epoch 138/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0628 - val_loss: 0.2859 Epoch 139/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0619 - val_loss: 0.2831 Epoch 140/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0609 - val_loss: 0.2802 Epoch 141/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0599 - val_loss: 0.2771 Epoch 142/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0589 - val_loss: 0.2743 Epoch 143/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0581 - val_loss: 0.2712 Epoch 144/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0571 - val_loss: 0.2682 Epoch 145/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0562 - val_loss: 0.2653 Epoch 146/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0553 - val_loss: 0.2631 Epoch 147/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0545 - val_loss: 0.2597 Epoch 148/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0535 - val_loss: 0.2570 Epoch 149/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0527 - val_loss: 0.2542 Epoch 150/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0518 - val_loss: 0.2517 Epoch 151/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0511 - val_loss: 0.2491 Epoch 152/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0502 - val_loss: 0.2465 Epoch 153/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0495 - val_loss: 0.2439 Epoch 154/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0487 - val_loss: 0.2413 Epoch 155/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0478 - val_loss: 0.2386 Epoch 156/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0471 - val_loss: 0.2362 Epoch 157/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0464 - val_loss: 0.2339 Epoch 158/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0456 - val_loss: 0.2316 Epoch 159/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0449 - val_loss: 0.2291 Epoch 160/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0442 - val_loss: 0.2267 Epoch 161/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0435 - val_loss: 0.2244 Epoch 162/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0428 - val_loss: 0.2221 Epoch 163/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0421 - val_loss: 0.2197 Epoch 164/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0414 - val_loss: 0.2176 Epoch 165/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0408 - val_loss: 0.2155 Epoch 166/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0401 - val_loss: 0.2135 Epoch 167/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0395 - val_loss: 0.2114 Epoch 168/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0388 - val_loss: 0.2091 Epoch 169/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0382 - val_loss: 0.2070 Epoch 170/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0376 - val_loss: 0.2052 Epoch 171/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0370 - val_loss: 0.2030 Epoch 172/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0364 - val_loss: 0.2011 Epoch 173/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0358 - val_loss: 0.1991 Epoch 174/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0353 - val_loss: 0.1971 Epoch 175/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0347 - val_loss: 0.1953 Epoch 176/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0342 - val_loss: 0.1935 Epoch 177/200 264/264 [==============================] - 1s 5ms/step - loss: 0.0336 - val_loss: 0.1917 Epoch 178/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0331 - val_loss: 0.1898 Epoch 179/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0325 - val_loss: 0.1880 Epoch 180/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0320 - val_loss: 0.1864 Epoch 181/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0315 - val_loss: 0.1844 Epoch 182/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0310 - val_loss: 0.1829 Epoch 183/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0305 - val_loss: 0.1811 Epoch 184/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0300 - val_loss: 0.1796 Epoch 185/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0295 - val_loss: 0.1777 Epoch 186/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0291 - val_loss: 0.1761 Epoch 187/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0286 - val_loss: 0.1746 Epoch 188/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0281 - val_loss: 0.1729 Epoch 189/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0277 - val_loss: 0.1715 Epoch 190/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0273 - val_loss: 0.1697 Epoch 191/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0268 - val_loss: 0.1682 Epoch 192/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0264 - val_loss: 0.1667 Epoch 193/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0260 - val_loss: 0.1651 Epoch 194/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0256 - val_loss: 0.1638 Epoch 195/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0252 - val_loss: 0.1623 Epoch 196/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0247 - val_loss: 0.1609 Epoch 197/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0244 - val_loss: 0.1596 Epoch 198/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0240 - val_loss: 0.1582 Epoch 199/200 264/264 [==============================] - 1s 6ms/step - loss: 0.0236 - val_loss: 0.1568 Epoch 200/200 264/264 [==============================] - 2s 6ms/step - loss: 0.0232 - val_loss: 0.1556
In [ ]:
Copied!
%load_ext tensorboard
%tensorboard --logdir logs
%load_ext tensorboard
%tensorboard --logdir logs
Output hidden; open in https://colab.research.google.com to view.
Fast K nearest neighbors¶
- scikit learn's KNN does not scale well; use Facebook's faiss library instead
In [ ]:
Copied!
# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev
# => doing so allows for "pip install faiss"
# => which then also allows for "import utils_knn"
# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev
# => doing so allows for "pip install faiss"
# => which then also allows for "import utils_knn"
Reading package lists... Done Building dependency tree Reading state information... Done The following additional packages will be installed: libomp5 Suggested packages: libomp-doc The following NEW packages will be installed: libomp-dev libomp5 0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded. Need to get 239 kB of archives. After this operation, 804 kB of additional disk space will be used. Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB] Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B] Fetched 239 kB in 1s (419 kB/s) debconf: unable to initialize frontend: Dialog debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.) debconf: falling back to frontend: Readline debconf: unable to initialize frontend: Readline debconf: (This frontend requires a controlling tty.) debconf: falling back to frontend: Teletype dpkg-preconfigure: unable to re-open stdin: Selecting previously unselected package libomp5:amd64. (Reading database ... 155455 files and directories currently installed.) Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ... Unpacking libomp5:amd64 (5.0.1-1) ... Selecting previously unselected package libomp-dev. Preparing to unpack .../libomp-dev_5.0.1-1_amd64.deb ... Unpacking libomp-dev (5.0.1-1) ... Setting up libomp5:amd64 (5.0.1-1) ... Setting up libomp-dev (5.0.1-1) ... Processing triggers for libc-bin (2.27-3ubuntu1.3) ... /sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link
In [ ]:
Copied!
# import utils_sys as usys
try:
import faiss
except:
# pip install faiss
usys.install('faiss')
import faiss
import knn_models
# import utils_sys as usys
try:
import faiss
except:
# pip install faiss
usys.install('faiss')
import faiss
import knn_models
In [ ]:
Copied!
from utilities import normalize
import scipy.sparse as sparse
# from sklearn.preprocessing import normalize
class FaissKNN:
def __init__(self, k=5, normalize=False):
self.index = None
self.y = None
self.y_tag = None # other meta data for the label/target such as polarities, colors
self.k = k
self.normalize_input = normalize
def fit(self, X, y):
self.index = faiss.IndexFlatL2(X.shape[1]) # Each x in X is in row-vector format i.e. X has shape (n_instances, n_dim)
# Note: Rating matrix (X), however, is in column-vector format; therefore, we need to remember to take transpose before using it as an input
if self.normalize_input:
X = normalize(X, axis=1) # X is in row-vector format
self.index.add(X.astype(np.float32))
self.y = y
def predict(self, X):
distances, indices = self.index.search(X.astype(np.float32), k=self.k)
# shape(distances): (n_instances, k)
# shape(indices): (n_instances, k)
votes = self.y[indices] # note: shape(votes)=shape(indices)
predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
# np.bincount([1, 1, 1, 0, 1, 0, 0, 0, 1, 1])
# ~> array([4, 6]) because index 0 occurs 4 times, and 1 occurs 6 times
return predictions
def search(self, X):
distances, indices = self.index.search(X.astype(np.float32), k=self.k)
return distances, indices
from utilities import normalize
import scipy.sparse as sparse
# from sklearn.preprocessing import normalize
class FaissKNN:
def __init__(self, k=5, normalize=False):
self.index = None
self.y = None
self.y_tag = None # other meta data for the label/target such as polarities, colors
self.k = k
self.normalize_input = normalize
def fit(self, X, y):
self.index = faiss.IndexFlatL2(X.shape[1]) # Each x in X is in row-vector format i.e. X has shape (n_instances, n_dim)
# Note: Rating matrix (X), however, is in column-vector format; therefore, we need to remember to take transpose before using it as an input
if self.normalize_input:
X = normalize(X, axis=1) # X is in row-vector format
self.index.add(X.astype(np.float32))
self.y = y
def predict(self, X):
distances, indices = self.index.search(X.astype(np.float32), k=self.k)
# shape(distances): (n_instances, k)
# shape(indices): (n_instances, k)
votes = self.y[indices] # note: shape(votes)=shape(indices)
predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
# np.bincount([1, 1, 1, 0, 1, 0, 0, 0, 1, 1])
# ~> array([4, 6]) because index 0 occurs 4 times, and 1 occurs 6 times
return predictions
def search(self, X):
distances, indices = self.index.search(X.astype(np.float32), k=self.k)
return distances, indices
In [ ]:
Copied!
from numpy import linalg as LA
from analyzer import is_sparse
from sklearn.preprocessing import normalize
import data_pipeline as dp
import utils_knn as uknn
from collections import namedtuple
from sklearn.metrics import f1_score
# import polarity_models as pmodel
# from polarity_models import Polarity
def predict_by_knn(model, model_knn, R, T, L_train, L_test, C, Pc, codes={}, pos_label=1, verbose=1):
"""
Parameters
----------
model: An instance of CFNet that has been pre-trained
model_knn: An instance of Faiss KNN model that has been pre-trained
R: probability/rating matrix of the training split
T:
L_train:
L_test:
C:
Pc: color matrix of the training split
"""
if verbose: np.set_printoptions(precision=3, edgeitems=5, suppress=True)
# Convert rating matrices back to typical ML training set format
X_train = R.T
X_test = T.T
# Find kNNs for each test instances in T
distances, knn_indices = model_knn.search(X_test)
N, k = knn_indices.shape # `knn_indices` is a k-by-N matrix, where k as in kNN and N is the sample size
n_users = T.shape[0]
assert N == T.shape[1], f"Size of test set: {T.shape[1]} inconsistent what's inferred from knn indices: {N}"
assert R.shape == Pc.shape
if len(codes) == 0: codes = Polarity.codes
if is_sparse(Pc): Pc = Pc.A #
# Infer true labels (L_train) from color matrix
L_train = pmodel.color_matrix_to_labels(Pc, codes=codes) # True labels for R
n_unreliable_knn_cases = 0
col_user, col_item, col_value = 'user', 'item', 'rating'
Th = np.zeros_like(T, dtype='float32') # Initialize the re-estimated test set (Th) for T
T_knn_best = np.zeros_like(T, dtype='float32')
T_avg = np.zeros_like(T, dtype='float32')
T_masked_avg = np.zeros_like(T, dtype='float32')
Th_reliable = np.zeros_like(T, dtype='float32') # unreliable entries are marked by special number (e.g. 0)
T_pred = {} # keep track of various predictied outputs according to different strategies
T_pred['knn_max'] = []
# kNN top of the top (rank kNNs further by their entropy values, the smaller the better)
# L_knn, top_indices = uknn.estimate_labels_by_rank(model_knn, T, Pc, topn=min(3, k),
# rank_fn=uknn.compute_entropy,
# larger_is_better=False,
# verbose=0)
msg = ''
test_points = np.random.choice(range(N), 10)
for i in tqdm(range(N)): # foreach position in the test split (T)
knn_idx = knn_indices[i] # test point (i)'s k nearest neighbors in R (in terms of their indices)
# knn_idx = top_indices[i]
Pc_i = Pc[:, knn_idx].astype(int) # subset the color matrix at kNN indices
# Method #1 Majority vote: Use the label determined by majority vote within kNNs
L_knn_i = pmodel.color_matrix_to_labels(Pc_i, codes=codes) # kNN's labels
ti_knn_max = np.argmax( np.bincount(L_knn_i) ) # kNN-predicted label by majority vote
# ti_knn_max = L_knn[i]
T_pred['knn_max'].append(ti_knn_max)
# Gather statistics
ni = Pc_i.size # ~ T.size
ntp = np.sum(Pc_i == codes['tp'])
ntn = np.sum(Pc_i == codes['tn'])
nfp = np.sum(Pc_i == codes['fp'])
nfn = np.sum(Pc_i == codes['fn'])
if (ntp+ntn)==0: # None of the base classifiers (users) made any correct predictions within these kNNs
n_unreliable_knn_cases += 1
# [Test]
if verbose > 1:
msg += f"[info] test point index: {i}\n" + '#' * 50 + '\n'
msg += f"> T({i}):\n{T[:, i]}\n"
msg += f"> R({i}):\n{R[:, knn_idx[0]]}\n" # point in R closest to the current test point T[:, i]
msg += f"> Pc_i(shape={Pc_i.shape}):\n{Pc_i}\n"
msg += f"> L_knn(size={len(L_knn)}):\n{L_knn}\n"
msg += f"> label prediction (knn) => {ti_knn_max}\n"
# Method #2 Best uses: foreach base classifier prediction in ti, use the "best" among these kNNs (majority vote followed by restiamte)
max_colors, max_indices = [], []
for u in range(n_users):
color, pos = uknn.most_common_element_and_position(Pc_i[u, :], pos_key_only=True)
max_colors.append(color)
max_indices.append(knn_idx[pos]) # we also want the knn index
X_knn_best = dp.zip_user_item_pairs(T, item_ids=max_indices)
y_knn_best = model.predict(X_knn_best)
T_knn_best[:, i] = np.squeeze(y_knn_best, axis=-1)
# Compute the mask within these kNN part of the training data
M = np.zeros_like(Pc_i) # np.repeat(Li, Pc_i.size).reshape(Pc_i.shape)
M[Pc_i > 0] = 1 # polarity > 0 => correct predictions (either TP or TN) => keep their re-estimated values by setting these entries to 1s
# ... polarity < 0 => incorrect predictions => discard by setting them to 0s
# Get re-estimated values for the kNN (of test instance)
X_knn = dp.make_user_item_pairs(T, item_ids=knn_idx) # structure k-NN in user-item-pair format for CFNet-based models
assert X_knn.shape[0] == Pc_i.size
y_knn = model.predict(X_knn)
T_knn = y_knn.reshape((n_users, len(knn_idx))) # use len(knn_idx) instead of `k` to consider the flexibility of selecting even fewer candidates
# if i == 10: print(f"[test] knn_idx: {knn_idx}"); print(f"[test] X_knn:\n{X_knn}\n"); print(f"[test] T_knn:\n{T_knn}\n")
assert T_knn.shape[1] <= k, f"T_knn[1] == k(NN): {k} but got {T_knn.shape[1]}"
assert T_knn.shape == Pc_i.shape, f"T_knn is a n_users-by-k matrix but got shape: {T_knn.shape}"
# Method #3 Column Average: Use the average across the re-estimated kNNs
ti_knn_avg = np.mean(T_knn, axis=1) # take column-wise average (i.e. for each user, take the average among kNNs)
T_avg[:, i] = ti_knn_avg
# Method #4 Masked Average: Use the reestimated values w.r.t ONLY those with positive polarity (i.e. averaging from TPs or TNs)
eps = 1e-4
ti_knn_masked_avg = (M*T_knn).sum(1)/(M.sum(1)+eps) # take average from non-zero entries only
T_masked_avg[:, i] = ti_knn_masked_avg
# Method #5 Adjusted Masked Average: Consider degenerative cases in which, for a given base classifier,
# NONE of its predictions in these kNNs are correct
# - It's possible that some classifiers never made correct predictions in the context of these kNNs
# - For these rows, their values in M are all zeros
# `- Set a default value if that's the case (e.g. average)
Th[:, i] = np.where(ti_knn_masked_avg == 0, ti_knn_avg, ti_knn_masked_avg)
# Method #6: Mark unreliable entries by -1 (and apply a post-hoc method to Th); post-hoc method is yet to be defined
Th_reliable[:, i] = np.where(ti_knn_masked_avg == 0, -1, ti_knn_masked_avg)
T_pred['T_knn_best'] = T_knn_best # best users
T_pred['T_avg'] = T_avg # average
T_pred['T_masked_avg'] = T_masked_avg # masked average
T_pred['Th'] = Th # adjusted masked average
T_pred['Th_reliable'] = Th_reliable # -1
if verbose:
print(f"[info] Number of unreliable kNN cases: {n_unreliable_knn_cases}")
return T_pred
# Normalize each data point so that they have a unit length
# R = normalize(R, axis=0, norm='l2')
# T = normalize(T, axis=0, norm='l2')
# test_points = np.random.choice(range(T.shape[1]), 10)
# for t in test_points:
# # print(f"norm({t})={LA.norm(T[:, t], 2)}")
# assert np.allclose(1.0, LA.norm(T[:, t], 2))
X_train = R.T
X_test = T.T
fknn = FaissKNN(k=10)
fknn.fit(X_train, L_train) # Note: X_train = np.tranpose(R)
assert Pc.shape == R.shape
assert Cw.shape == R.shape
assert len(L_train) == R.shape[1]
T_pred = predict_by_knn(cf_model, fknn,
R, T, L_train, L_test, Cw, Pc,
codes=Polarity.codes, pos_label=1, verbose=1)
# A CF ensemble dataset consists of several parts: original (rating) matrix, re-estimated matrix, ...
# - namedtuple comes in handy
DataSet = namedtuple("DataSet", "X, Xh, L") # declare a `DataSet` type with the attributes: X, Xh and L
Hyperparams = namedtuple("Hyperparams", "alpha, n_factors, policy_threshold, conf_measure")
# The objects associated with traing split, hyperparameters are invariant across different prediction strategies
####################################################
Rh, _ = cm.reestimate(cf_model, R) # We still use cf_model alone to reestimate Rh (no kNN involved)
meta = Hyperparams(policy_threshold=policy_threshold,
conf_measure=conf_measure,
alpha=alpha, n_factors=n_factors)
train_split = DataSet(R, Rh, L_train)
####################################################
test_split = DataSet(T, T_pred['T_avg'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Average: knn-reestimate the entire T with learned latent factors")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['T_masked_avg'], L_test)
highlight(f"(kNN) Masked Average: kNN-reestimate T using ONLY reliable entries")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['Th'], L_test)
highlight(f"(kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation'")
# lh, lh_new, p_threshold, p_threshold_new = \
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['T_knn_best'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
highlight(f"(kNN) Prediction via majority vote within kNNs (not recommended)")
perf_score = f1_score(test_split.L, T_pred['knn_max'])
print(f'[result] F1 score: {perf_score}')
from numpy import linalg as LA
from analyzer import is_sparse
from sklearn.preprocessing import normalize
import data_pipeline as dp
import utils_knn as uknn
from collections import namedtuple
from sklearn.metrics import f1_score
# import polarity_models as pmodel
# from polarity_models import Polarity
def predict_by_knn(model, model_knn, R, T, L_train, L_test, C, Pc, codes={}, pos_label=1, verbose=1):
"""
Parameters
----------
model: An instance of CFNet that has been pre-trained
model_knn: An instance of Faiss KNN model that has been pre-trained
R: probability/rating matrix of the training split
T:
L_train:
L_test:
C:
Pc: color matrix of the training split
"""
if verbose: np.set_printoptions(precision=3, edgeitems=5, suppress=True)
# Convert rating matrices back to typical ML training set format
X_train = R.T
X_test = T.T
# Find kNNs for each test instances in T
distances, knn_indices = model_knn.search(X_test)
N, k = knn_indices.shape # `knn_indices` is a k-by-N matrix, where k as in kNN and N is the sample size
n_users = T.shape[0]
assert N == T.shape[1], f"Size of test set: {T.shape[1]} inconsistent what's inferred from knn indices: {N}"
assert R.shape == Pc.shape
if len(codes) == 0: codes = Polarity.codes
if is_sparse(Pc): Pc = Pc.A #
# Infer true labels (L_train) from color matrix
L_train = pmodel.color_matrix_to_labels(Pc, codes=codes) # True labels for R
n_unreliable_knn_cases = 0
col_user, col_item, col_value = 'user', 'item', 'rating'
Th = np.zeros_like(T, dtype='float32') # Initialize the re-estimated test set (Th) for T
T_knn_best = np.zeros_like(T, dtype='float32')
T_avg = np.zeros_like(T, dtype='float32')
T_masked_avg = np.zeros_like(T, dtype='float32')
Th_reliable = np.zeros_like(T, dtype='float32') # unreliable entries are marked by special number (e.g. 0)
T_pred = {} # keep track of various predictied outputs according to different strategies
T_pred['knn_max'] = []
# kNN top of the top (rank kNNs further by their entropy values, the smaller the better)
# L_knn, top_indices = uknn.estimate_labels_by_rank(model_knn, T, Pc, topn=min(3, k),
# rank_fn=uknn.compute_entropy,
# larger_is_better=False,
# verbose=0)
msg = ''
test_points = np.random.choice(range(N), 10)
for i in tqdm(range(N)): # foreach position in the test split (T)
knn_idx = knn_indices[i] # test point (i)'s k nearest neighbors in R (in terms of their indices)
# knn_idx = top_indices[i]
Pc_i = Pc[:, knn_idx].astype(int) # subset the color matrix at kNN indices
# Method #1 Majority vote: Use the label determined by majority vote within kNNs
L_knn_i = pmodel.color_matrix_to_labels(Pc_i, codes=codes) # kNN's labels
ti_knn_max = np.argmax( np.bincount(L_knn_i) ) # kNN-predicted label by majority vote
# ti_knn_max = L_knn[i]
T_pred['knn_max'].append(ti_knn_max)
# Gather statistics
ni = Pc_i.size # ~ T.size
ntp = np.sum(Pc_i == codes['tp'])
ntn = np.sum(Pc_i == codes['tn'])
nfp = np.sum(Pc_i == codes['fp'])
nfn = np.sum(Pc_i == codes['fn'])
if (ntp+ntn)==0: # None of the base classifiers (users) made any correct predictions within these kNNs
n_unreliable_knn_cases += 1
# [Test]
if verbose > 1:
msg += f"[info] test point index: {i}\n" + '#' * 50 + '\n'
msg += f"> T({i}):\n{T[:, i]}\n"
msg += f"> R({i}):\n{R[:, knn_idx[0]]}\n" # point in R closest to the current test point T[:, i]
msg += f"> Pc_i(shape={Pc_i.shape}):\n{Pc_i}\n"
msg += f"> L_knn(size={len(L_knn)}):\n{L_knn}\n"
msg += f"> label prediction (knn) => {ti_knn_max}\n"
# Method #2 Best uses: foreach base classifier prediction in ti, use the "best" among these kNNs (majority vote followed by restiamte)
max_colors, max_indices = [], []
for u in range(n_users):
color, pos = uknn.most_common_element_and_position(Pc_i[u, :], pos_key_only=True)
max_colors.append(color)
max_indices.append(knn_idx[pos]) # we also want the knn index
X_knn_best = dp.zip_user_item_pairs(T, item_ids=max_indices)
y_knn_best = model.predict(X_knn_best)
T_knn_best[:, i] = np.squeeze(y_knn_best, axis=-1)
# Compute the mask within these kNN part of the training data
M = np.zeros_like(Pc_i) # np.repeat(Li, Pc_i.size).reshape(Pc_i.shape)
M[Pc_i > 0] = 1 # polarity > 0 => correct predictions (either TP or TN) => keep their re-estimated values by setting these entries to 1s
# ... polarity < 0 => incorrect predictions => discard by setting them to 0s
# Get re-estimated values for the kNN (of test instance)
X_knn = dp.make_user_item_pairs(T, item_ids=knn_idx) # structure k-NN in user-item-pair format for CFNet-based models
assert X_knn.shape[0] == Pc_i.size
y_knn = model.predict(X_knn)
T_knn = y_knn.reshape((n_users, len(knn_idx))) # use len(knn_idx) instead of `k` to consider the flexibility of selecting even fewer candidates
# if i == 10: print(f"[test] knn_idx: {knn_idx}"); print(f"[test] X_knn:\n{X_knn}\n"); print(f"[test] T_knn:\n{T_knn}\n")
assert T_knn.shape[1] <= k, f"T_knn[1] == k(NN): {k} but got {T_knn.shape[1]}"
assert T_knn.shape == Pc_i.shape, f"T_knn is a n_users-by-k matrix but got shape: {T_knn.shape}"
# Method #3 Column Average: Use the average across the re-estimated kNNs
ti_knn_avg = np.mean(T_knn, axis=1) # take column-wise average (i.e. for each user, take the average among kNNs)
T_avg[:, i] = ti_knn_avg
# Method #4 Masked Average: Use the reestimated values w.r.t ONLY those with positive polarity (i.e. averaging from TPs or TNs)
eps = 1e-4
ti_knn_masked_avg = (M*T_knn).sum(1)/(M.sum(1)+eps) # take average from non-zero entries only
T_masked_avg[:, i] = ti_knn_masked_avg
# Method #5 Adjusted Masked Average: Consider degenerative cases in which, for a given base classifier,
# NONE of its predictions in these kNNs are correct
# - It's possible that some classifiers never made correct predictions in the context of these kNNs
# - For these rows, their values in M are all zeros
# `- Set a default value if that's the case (e.g. average)
Th[:, i] = np.where(ti_knn_masked_avg == 0, ti_knn_avg, ti_knn_masked_avg)
# Method #6: Mark unreliable entries by -1 (and apply a post-hoc method to Th); post-hoc method is yet to be defined
Th_reliable[:, i] = np.where(ti_knn_masked_avg == 0, -1, ti_knn_masked_avg)
T_pred['T_knn_best'] = T_knn_best # best users
T_pred['T_avg'] = T_avg # average
T_pred['T_masked_avg'] = T_masked_avg # masked average
T_pred['Th'] = Th # adjusted masked average
T_pred['Th_reliable'] = Th_reliable # -1
if verbose:
print(f"[info] Number of unreliable kNN cases: {n_unreliable_knn_cases}")
return T_pred
# Normalize each data point so that they have a unit length
# R = normalize(R, axis=0, norm='l2')
# T = normalize(T, axis=0, norm='l2')
# test_points = np.random.choice(range(T.shape[1]), 10)
# for t in test_points:
# # print(f"norm({t})={LA.norm(T[:, t], 2)}")
# assert np.allclose(1.0, LA.norm(T[:, t], 2))
X_train = R.T
X_test = T.T
fknn = FaissKNN(k=10)
fknn.fit(X_train, L_train) # Note: X_train = np.tranpose(R)
assert Pc.shape == R.shape
assert Cw.shape == R.shape
assert len(L_train) == R.shape[1]
T_pred = predict_by_knn(cf_model, fknn,
R, T, L_train, L_test, Cw, Pc,
codes=Polarity.codes, pos_label=1, verbose=1)
# A CF ensemble dataset consists of several parts: original (rating) matrix, re-estimated matrix, ...
# - namedtuple comes in handy
DataSet = namedtuple("DataSet", "X, Xh, L") # declare a `DataSet` type with the attributes: X, Xh and L
Hyperparams = namedtuple("Hyperparams", "alpha, n_factors, policy_threshold, conf_measure")
# The objects associated with traing split, hyperparameters are invariant across different prediction strategies
####################################################
Rh, _ = cm.reestimate(cf_model, R) # We still use cf_model alone to reestimate Rh (no kNN involved)
meta = Hyperparams(policy_threshold=policy_threshold,
conf_measure=conf_measure,
alpha=alpha, n_factors=n_factors)
train_split = DataSet(R, Rh, L_train)
####################################################
test_split = DataSet(T, T_pred['T_avg'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Average: knn-reestimate the entire T with learned latent factors")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['T_masked_avg'], L_test)
highlight(f"(kNN) Masked Average: kNN-reestimate T using ONLY reliable entries")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['Th'], L_test)
highlight(f"(kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation'")
# lh, lh_new, p_threshold, p_threshold_new = \
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
test_split = DataSet(T, T_pred['T_knn_best'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)
highlight(f"(kNN) Prediction via majority vote within kNNs (not recommended)")
perf_score = f1_score(test_split.L, T_pred['knn_max'])
print(f'[result] F1 score: {perf_score}')
100%|██████████| 1250/1250 [02:31<00:00, 8.24it/s]
[info] Number of unreliable kNN cases: 8
================================================================================ (kNN) Average: knn-reestimate the entire T with learned latent factors ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 76.3069947781743 [info] From T to Th, delta(Frobenius norm)= 40.07294361077488 [info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626 ... Original p_threshold: [0.499 0. 0.008 0. 0.072] ... New p_threshold: [1. 1. 0.957 0.696 0.707] [info] How different are lh and lh_new? 0.46 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984 [result] Majority vote: F1 score with re-estimated Th: 0.16216216216216214 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.18666666666666668 [result] Best settings: lh_maxvote, score: 0.20470262793914248 ================================================================================ (kNN) Masked Average: kNN-reestimate T using ONLY reliable entries ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 76.3069947781743 [info] From T to Th, delta(Frobenius norm)= 42.30971377472983 [info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626 ... Original p_threshold: [0.499 0. 0.008 0. 0.072] ... New p_threshold: [1. 1. 0.957 0.696 0.707] [info] How different are lh and lh_new? 0.4384 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.21865596790371114 [result] Majority vote: F1 score with re-estimated Th: 0.2057142857142857 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.2 [result] Best settings: lh2_maxvote_pth_unadjusted, score: 0.21865596790371114 ================================================================================ (kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation' ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 76.3069947781743 [info] From T to Th, delta(Frobenius norm)= 46.177436660954605 [info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626 ... Original p_threshold: [0.499 0. 0.008 0. 0.072] ... New p_threshold: [1. 1. 0.957 0.696 0.707] [info] How different are lh and lh_new? 0.4384 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984 [result] Majority vote: F1 score with re-estimated Th: 0.2057142857142857 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.2 [result] Best settings: lh2_maxvote_pth_adjusted, score: 0.2057142857142857 ================================================================================ (kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 76.3069947781743 [info] From T to Th, delta(Frobenius norm)= 46.48002205715579 [info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626 ... Original p_threshold: [0.499 0. 0.008 0. 0.072] ... New p_threshold: [1. 1. 0.957 0.696 0.707] [info] How different are lh and lh_new? 0.4568 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984 [result] Majority vote: F1 score with re-estimated Th: 0.2271762208067941 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.19889502762430938 [result] Best settings: lh2_maxvote_pth_adjusted, score: 0.2271762208067941 ================================================================================ (kNN) Prediction via majority vote within kNNs (not recommended) ================================================================================ [result] F1 score: 0.18666666666666668
Error Analysis¶
- Let's observe the color patterns associated with the positive exmaples (the minority class)
In [ ]:
Copied!
from utils_knn import estimate_labels_by_rank, compute_entropy
assert Pc.shape == R.shape
assert T.shape[0] == Pc.shape[0]
topn = 3
# fknn = FaissKNN(k=10)
# fknn.fit(X_train, L_train)
lh, top_indices = estimate_labels_by_rank(fknn, T, Pc, topn=topn, rank_fn=compute_entropy,
larger_is_better=False,
verbose=2)
assert np.array(top_indices).shape[0] == T.shape[1]
assert np.array(top_indices).shape[1] == topn
assert len(lh) == T.shape[1]
from utils_knn import estimate_labels_by_rank, compute_entropy
assert Pc.shape == R.shape
assert T.shape[0] == Pc.shape[0]
topn = 3
# fknn = FaissKNN(k=10)
# fknn.fit(X_train, L_train)
lh, top_indices = estimate_labels_by_rank(fknn, T, Pc, topn=topn, rank_fn=compute_entropy,
larger_is_better=False,
verbose=2)
assert np.array(top_indices).shape[0] == T.shape[1]
assert np.array(top_indices).shape[1] == topn
assert len(lh) == T.shape[1]
[info] Pc_592: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 1 1 1 1 -1 1 1 1] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2]] [info] sorted_knn_i (n=3): [(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)] [info] top_knn_i: [3114, 1658, 640] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_592 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [ 1 1 1] [-2 -2 -2]] [info] Pc_650: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [-1 1 1 1 1 1 1 1 1 1]] [info] sorted_knn_i (n=3): [(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)] [info] top_knn_i: [3458, 412, 617] [info] L_knn(n=3): [1 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_650 local: [[ 2 -2 -2] [ 2 -2 -2] [ 2 -2 -2] [-1 1 1] [-1 1 1]] [info] Pc_720: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -2 1 1 1 1 1 1] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] [info] sorted_knn_i (n=3): [(0.0, 3), (0.7219280948873623, 0), (0.7219280948873623, 1)] [info] top_knn_i: [3570, 3101, 1657] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [3, 0, 1] ..... Pc_720 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [-2 1 1] [-2 -2 -2]] [info] Pc_729: [[-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -1 1 1 1 -2 -2 1]] [info] sorted_knn_i (n=3): [(0.0, 7), (0.0, 8), (0.7219280948873623, 0)] [info] top_knn_i: [807, 548, 775] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [7, 8, 0] ..... Pc_729 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [-2 -2 1]] [info] Pc_756: [[-2 2 -2 -2 2 -2 -2 -2 2 -2] [-2 2 -2 -2 2 -2 -2 -2 2 -2] [-2 2 -2 -2 2 -2 -2 -2 2 -2] [ 1 -1 1 1 -1 1 1 1 -1 -2] [-2 2 -2 -2 2 -2 -2 -2 2 -2]] [info] sorted_knn_i (n=3): [(0.0, 9), (0.7219280948873623, 0), (0.7219280948873623, 1)] [info] top_knn_i: [2520, 2721, 1242] [info] L_knn(n=3): [0 0 1] ..... top_knn_ij: [9, 0, 1] ..... Pc_756 local: [[-2 -2 2] [-2 -2 2] [-2 -2 2] [-2 1 -1] [-2 -2 2]] [info] Pc_855: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 1 1 1 1 -1 1 1 1]] [info] sorted_knn_i (n=3): [(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)] [info] top_knn_i: [1622, 839, 789] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_855 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [ 1 1 1]] [info] Pc_915: [[-2 2 -2 -2 2 2 -2 -2 -2 -2] [-2 2 -2 -2 2 2 -2 -2 -2 -2] [-2 2 -2 -2 2 2 -2 -2 -2 -2] [-2 2 -2 -2 2 2 -2 -2 -2 -2] [ 1 -1 1 1 -1 2 -2 1 -2 1]] [info] sorted_knn_i (n=3): [(0.0, 5), (0.0, 6), (0.0, 8)] [info] top_knn_i: [3231, 1405, 146] [info] L_knn(n=3): [1 0 0] ..... top_knn_ij: [5, 6, 8] ..... Pc_915 local: [[ 2 -2 -2] [ 2 -2 -2] [ 2 -2 -2] [ 2 -2 -2] [ 2 -2 -2]] [info] Pc_1048: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] [info] sorted_knn_i (n=3): [(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)] [info] top_knn_i: [2485, 372, 3190] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_1048 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [ 1 1 1] [ 1 1 1]] [info] Pc_1134: [[-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -1 1 -2 1 -2 1 1] [-2 -2 -2 2 1 1 -2 1 -2 1]] [info] sorted_knn_i (n=3): [(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)] [info] top_knn_i: [1344, 1504, 1804] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_1134 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [ 1 1 1] [-2 -2 -2]] [info] Pc_1227: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 1 1 1 1 1 1 1 -1 1] [ 1 1 1 1 1 1 1 1 -1 1]] [info] sorted_knn_i (n=3): [(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)] [info] top_knn_i: [2811, 2698, 2029] [info] L_knn(n=3): [0 0 0] ..... top_knn_ij: [0, 1, 2] ..... Pc_1227 local: [[-2 -2 -2] [-2 -2 -2] [-2 -2 -2] [ 1 1 1] [ 1 1 1]]
In [ ]:
Copied!
import utils_knn as uknn
X_test = test_split.X.T
L_test = test_split.L
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=1)
import utils_knn as uknn
X_test = test_split.X.T
L_test = test_split.L
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=1)
> Positive example #1 > Pc_0: [[ 2 2 2 2 -2 -2 2 2 -2 -2] [ 2 2 2 2 -2 -2 2 2 -2 -2] [ 2 2 2 2 -2 -2 2 2 -2 -2] [ 2 2 2 2 -2 -2 2 2 -2 -2] [ 2 2 2 2 -2 -2 2 2 -2 -2]] > colors: [2, 2, 2, 2, 2] > indices: [1793, 1793, 1793, 1793, 1793] -------------------------------------------------- > Positive example #2 > Pc_1: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -2 1 1 1 -2 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [2422, 2422, 2422, 2422, 2422] -------------------------------------------------- > Positive example #3 > Pc_2: [[-2 -2 -2 2 -2 -2 -2 2 -2 -2] [-2 -2 -2 2 -2 -2 -2 2 -2 -2] [-2 -2 -2 2 -2 -2 -2 2 -2 -2] [ 1 1 1 -1 1 1 1 -1 1 1] [ 1 1 1 -1 1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3238, 3238, 3238, 1705, 1705] -------------------------------------------------- > Positive example #4 > Pc_3: [[-2 -2 -2 -2 2 -2 -2 -2 2 -2] [-2 -2 -2 -2 2 -2 -2 -2 2 -2] [-2 -2 -2 -2 2 -2 -2 -2 2 -2] [-2 -2 -2 -2 2 -2 -2 -2 2 -2] [-2 -2 -2 -2 2 -2 -2 -2 2 -2]] > colors: [2, 2, 2, 2, 2] > indices: [743, 743, 743, 743, 743] -------------------------------------------------- > Positive example #5 > Pc_4: [[-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 2 1 1 1 1 1 -1 1 1] [ 1 -1 1 1 1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3032, 3032, 3032, 432, 432] -------------------------------------------------- > Positive example #6 > Pc_5: [[-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 1 1 1 -1 1 1 1 1 1] [ 1 1 1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2783, 2783, 2783, 3366, 3366] -------------------------------------------------- > Positive example #7 > Pc_6: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [444, 444, 444, 444, 444] -------------------------------------------------- > Positive example #8 > Pc_7: [[-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 -1 1 1 1 1 1 1 1 1] [ 1 -1 1 1 1 1 -2 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3384, 3384, 3384, 2549, 2549] -------------------------------------------------- > Positive example #9 > Pc_8: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [1159, 1159, 1159, 1159, 1159] -------------------------------------------------- > Positive example #10 > Pc_9: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 -2]] > colors: [-2, -2, -2, -2, 1] > indices: [3479, 3479, 3479, 3479, 3479] -------------------------------------------------- > Positive example #11 > Pc_10: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2]] > colors: [2, 2, 2, 2, 2] > indices: [2577, 2577, 2577, 2577, 2577] -------------------------------------------------- > Positive example #12 > Pc_11: [[-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [ 1 1 -1 1 1 1 -1 1 1 1] [ 1 1 -1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2547, 2547, 2547, 2146, 2146] -------------------------------------------------- > Positive example #13 > Pc_12: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 1 1 1 1 1 -1 -2 1 1]] > colors: [2, 2, 1, 2, 1] > indices: [3000, 3000, 190, 3000, 231] -------------------------------------------------- > Positive example #14 > Pc_13: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1] [ 1 1 -1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3415, 3415, 3415, 2811, 2811] -------------------------------------------------- > Positive example #15 > Pc_14: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [2838, 2838, 2838, 3363, 2838] -------------------------------------------------- > Positive example #16 > Pc_15: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 1 -2 1 1 -1 1 1 1] [ 1 1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [860, 860, 860, 2788, 2788] -------------------------------------------------- > Positive example #17 > Pc_16: [[-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 -2 1 1 -1 1 1 1 1 1] [ 1 1 1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [21, 21, 21, 1675, 1675] -------------------------------------------------- > Positive example #18 > Pc_17: [[2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2]] > colors: [2, 2, 2, 2, 2] > indices: [1931, 1931, 1931, 1931, 1931] -------------------------------------------------- > Positive example #19 > Pc_18: [[-2 2 -2 -2 2 -2 -2 -2 -2 -2] [-2 2 -2 -2 2 -2 -2 -2 -2 -2] [-2 2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 -1 1 1 -1 1 1 -2 1 1] [ 1 -1 1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [136, 136, 136, 1703, 1703] -------------------------------------------------- > Positive example #20 > Pc_19: [[2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2]] > colors: [2, 2, 2, 2, 2] > indices: [1931, 1931, 1931, 1931, 1931] -------------------------------------------------- > Positive example #21 > Pc_20: [[2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2]] > colors: [2, 2, 2, 2, 2] > indices: [509, 509, 509, 509, 509] -------------------------------------------------- > Positive example #22 > Pc_21: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 1 -2 1 1 -1 1 1 1] [ 1 1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [860, 860, 860, 2788, 2788] -------------------------------------------------- > Positive example #23 > Pc_22: [[-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [ 1 1 1 1 1 1 -1 1 -1 -1] [ 1 1 1 1 1 1 -1 1 -1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [2447, 2447, 2447, 3410, 3410] -------------------------------------------------- > Positive example #24 > Pc_23: [[ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [-1 1 1 -2 1 1 -1 1 -1 1] [-1 1 1 1 1 1 -1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2796, 2796, 2796, 2562, 2562] -------------------------------------------------- > Positive example #25 > Pc_24: [[-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -1 1 1 1 1 1 1] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [2041, 2041, 2041, 3350, 2041] -------------------------------------------------- > Positive example #26 > Pc_25: [[-2 2 -2 -2 -2 -2 -2 -2 2 -2] [-2 2 -2 -2 -2 -2 -2 -2 2 -2] [-2 2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 -1 1 1 1 1 1 1 -1 1] [-2 2 -2 -2 -2 -2 -2 -2 2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [3342, 3342, 3342, 3319, 3342] -------------------------------------------------- > Positive example #27 > Pc_26: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [-1 1 1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1449, 1449, 1449, 3007, 3007] -------------------------------------------------- > Positive example #28 > Pc_27: [[ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [ 2 -2 -2 -2 -2 -2 2 -2 2 -2] [-1 1 1 -2 1 1 -1 1 -1 1] [-1 1 1 1 1 1 -1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2796, 2796, 2796, 2562, 2562] -------------------------------------------------- > Positive example #29 > Pc_28: [[2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2] [2 2 2 2 2 2 2 2 2 2]] > colors: [2, 2, 2, 2, 2] > indices: [855, 855, 855, 855, 855] -------------------------------------------------- > Positive example #30 > Pc_29: [[ 2 -2 -2 -2 -2 2 -2 -2 2 -2] [ 2 -2 -2 -2 -2 2 -2 -2 2 -2] [ 2 -2 -2 -2 -2 2 -2 -2 2 -2] [-1 1 1 1 1 -1 1 1 -1 1] [-1 1 1 1 1 -1 1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2063, 2063, 2063, 205, 205] -------------------------------------------------- > Positive example #31 > Pc_30: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [1873, 1873, 1873, 1873, 1873] -------------------------------------------------- > Positive example #32 > Pc_31: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1] [ 1 1 -1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1972, 1972, 1972, 3468, 3468] -------------------------------------------------- > Positive example #33 > Pc_32: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [-1 1 1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3458, 3458, 3458, 412, 412] -------------------------------------------------- > Positive example #34 > Pc_33: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 1 1 1 1 1 1 1 -1 1] [ 1 1 1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1366, 1366, 1366, 1569, 1569] -------------------------------------------------- > Positive example #35 > Pc_34: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 2, 1] > indices: [183, 183, 183, 183, 145] -------------------------------------------------- > Positive example #36 > Pc_35: [[-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [ 1 1 -1 1 1 1 -1 1 1 1] [ 1 1 -1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2547, 2547, 2547, 2146, 2146] -------------------------------------------------- > Positive example #37 > Pc_36: [[-2 2 -2 -2 -2 -2 -2 -2 2 -2] [-2 2 -2 -2 -2 -2 -2 -2 2 -2] [-2 2 -2 -2 -2 -2 -2 -2 2 -2] [-2 2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 -1 1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 2, 1] > indices: [2119, 2119, 2119, 2119, 629] -------------------------------------------------- > Positive example #38 > Pc_37: [[-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [-2 -2 2 -2 -2 -2 2 -2 -2 -2] [ 1 1 -1 1 1 1 -1 1 1 1] [ 1 1 -1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2547, 2547, 2547, 2146, 2146] -------------------------------------------------- > Positive example #39 > Pc_38: [[-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 1 1 1 1 1 1 -1 1 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [1165, 1165, 1165, 3529, 1165] -------------------------------------------------- > Positive example #40 > Pc_39: [[-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [ 1 1 1 1 1 1 -1 1 -1 -1] [ 1 1 1 1 1 1 -1 1 -1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [2447, 2447, 2447, 3410, 3410] -------------------------------------------------- > Positive example #41 > Pc_40: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 1 1 1 1 1 1 1 1] [-1 1 1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1740, 1740, 1740, 3053, 3647] -------------------------------------------------- > Positive example #42 > Pc_41: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [2991, 2991, 2991, 2991, 2991] -------------------------------------------------- > Positive example #43 > Pc_42: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [2331, 2331, 2331, 2331, 2331] -------------------------------------------------- > Positive example #44 > Pc_43: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 1 -2 1 1 1 1 1 -1 1] [ 1 1 1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1865, 1865, 1865, 1692, 1692] -------------------------------------------------- > Positive example #45 > Pc_44: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 1 1 1 1 1 1 1 -1 1] [ 1 1 1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2078, 2078, 2078, 1411, 1411] -------------------------------------------------- > Positive example #46 > Pc_45: [[-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [-2 -2 -2 -2 -2 -2 2 -2 2 2] [ 1 1 1 1 1 1 -1 1 -1 -1] [ 1 1 1 1 1 1 -1 1 -1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [2447, 2447, 2447, 3410, 3410] -------------------------------------------------- > Positive example #47 > Pc_46: [[-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 2 1 1 1 1 1 -1 1 1] [ 1 -1 1 1 1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3032, 3032, 3032, 432, 432] -------------------------------------------------- > Positive example #48 > Pc_47: [[-2 -2 -2 2 -2 -2 -2 2 -2 -2] [-2 -2 -2 2 -2 -2 -2 2 -2 -2] [-2 -2 -2 2 -2 -2 -2 2 -2 -2] [ 1 1 1 -1 1 1 1 -1 1 1] [ 1 1 1 -1 1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3238, 3238, 3238, 1705, 1705] -------------------------------------------------- > Positive example #49 > Pc_48: [[-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [-2 2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 2 1 1 1 1 1 -1 1 1] [ 1 -1 1 1 1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3032, 3032, 3032, 432, 432] -------------------------------------------------- > Positive example #50 > Pc_49: [[-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 -1 1 1 1 1 -2 1 1 1] [ 1 -1 1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [622, 622, 622, 344, 344] -------------------------------------------------- [info] Found 36 cases for which the majority 'color' does not come from the same training instance
- Also observe the color patterns associated with negative examples (majority class)
In [ ]:
Copied!
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=0)
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=0)
> Negative example #1 > Pc_0: [[-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 1 1 1 -1 1 1 1 1 1] [ 1 1 1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [906, 906, 906, 123, 123] -------------------------------------------------- > Negative example #2 > Pc_1: [[-2 -2 2 -2 -2 -2 -2 -2 2 -2] [-2 -2 2 -2 -2 -2 -2 -2 2 -2] [-2 -2 2 -2 -2 -2 -2 -2 2 1] [-2 -2 2 -2 -2 -2 -2 -2 2 -2] [ 1 1 -1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 2, 1] > indices: [1301, 1301, 1301, 1301, 889] -------------------------------------------------- > Negative example #3 > Pc_2: [[-2 -2 -2 -2 -2 -2 -2 -2 2 2] [-2 -2 -2 -2 -2 -2 -2 -2 2 2] [-2 -2 -2 -2 -2 -2 -2 -2 2 2] [-2 1 1 1 1 1 -2 1 -1 -1] [-2 1 -2 1 1 1 1 -2 -1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [2962, 2962, 2962, 2187, 2187] -------------------------------------------------- > Negative example #4 > Pc_3: [[-2 -2 -2 -2 -2 2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 2 -2 -2 -2 -2] [-2 -2 -2 1 1 -1 1 1 1 1] [ 1 1 1 1 1 -1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2063, 2063, 2063, 2840, 3296] -------------------------------------------------- > Negative example #5 > Pc_4: [[-2 -2 2 2 -2 -2 -2 -2 -2 2] [-2 -2 2 2 -2 -2 -2 -2 -2 2] [-2 -2 2 2 -2 -2 -2 -2 -2 2] [ 1 1 -1 -1 1 1 1 -2 1 -1] [ 1 1 -1 -1 1 1 1 1 1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [3125, 3125, 3125, 1668, 1668] -------------------------------------------------- > Negative example #6 > Pc_5: [[-2 2 -2 -2 -2 2 -2 2 2 -2] [-2 2 -2 -2 -2 2 -2 2 2 -2] [-2 2 -2 -2 -2 2 -2 2 2 -2] [-2 2 -2 -2 -2 2 -2 2 2 -2] [-2 2 -2 -2 -2 2 -2 2 2 -2]] > colors: [2, 2, 2, 2, 2] > indices: [2834, 2834, 2834, 2834, 2834] -------------------------------------------------- > Negative example #7 > Pc_6: [[-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -1 1 1 1 1 1 1]] > colors: [2, 2, 2, 2, 1] > indices: [203, 203, 203, 203, 1259] -------------------------------------------------- > Negative example #8 > Pc_7: [[ 2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-1 1 1 -1 1 1 1 1 1 -2]] > colors: [2, 2, 2, 2, 1] > indices: [184, 184, 184, 184, 1200] -------------------------------------------------- > Negative example #9 > Pc_8: [[-2 -2 -2 -2 2 -2 2 -2 -2 -2] [-2 -2 -2 -2 2 -2 2 -2 -2 -2] [-2 -2 -2 -2 2 -2 2 -2 -2 -2] [-2 -2 -2 -2 2 -2 2 -2 -2 -2] [ 1 1 1 1 -1 1 -1 1 1 1]] > colors: [2, 2, 2, 2, 1] > indices: [2558, 2558, 2558, 2558, 2348] -------------------------------------------------- > Negative example #10 > Pc_9: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [-1 1 1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1807, 1807, 1807, 304, 304] -------------------------------------------------- > Negative example #11 > Pc_10: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, -2, 1] > indices: [2821, 2821, 2821, 2821, 2821] -------------------------------------------------- > Negative example #12 > Pc_11: [[-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 1 1 1 -1 1 1 -2 1 1] [ 1 1 1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2467, 2467, 2467, 1810, 1810] -------------------------------------------------- > Negative example #13 > Pc_12: [[-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 1 1 -2 -2 -1 -2 1 1 1 -2]] > colors: [2, 2, 2, 2, 1] > indices: [27, 27, 27, 27, 1180] -------------------------------------------------- > Negative example #14 > Pc_13: [[-2 -2 -2 -2 -2 -2 2 -2 2 -2] [-2 -2 -2 -2 -2 -2 2 -2 2 -2] [-2 -2 -2 -2 -2 -2 2 -2 2 -2] [ 1 1 -2 1 1 1 -1 1 -1 -2] [ 1 1 1 1 1 1 -1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1865, 1865, 1865, 437, 437] -------------------------------------------------- > Negative example #15 > Pc_14: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 1 1 1 1 1 -1 1 1 1] [ 1 1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1798, 1798, 1798, 1630, 3536] -------------------------------------------------- > Negative example #16 > Pc_15: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 1 -2 -2 2 -2 1 -2] [ 1 1 1 1 1 1 -1 1 1 -2] [ 1 1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 1, 1, 1] > indices: [1785, 1785, 475, 180, 180] -------------------------------------------------- > Negative example #17 > Pc_16: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [1406, 1406, 1406, 1406, 1406] -------------------------------------------------- > Negative example #18 > Pc_17: [[-2 -2 -2 2 2 -2 -2 -2 -2 -2] [-2 -2 -2 2 2 -2 -2 -2 -2 -2] [-2 -2 -2 2 2 -2 -2 -2 -2 -2] [ 1 -2 1 -1 -1 1 1 1 1 1] [ 1 1 1 -1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2117, 2117, 2117, 940, 940] -------------------------------------------------- > Negative example #19 > Pc_18: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 2, 2] > indices: [1316, 1316, 1316, 1316, 1316] -------------------------------------------------- > Negative example #20 > Pc_19: [[-2 -2 -2 -2 2 -2 -2 2 -2 -2] [-2 -2 -2 -2 2 -2 -2 2 -2 -2] [-2 -2 -2 -2 2 -2 -2 2 -2 -2] [ 1 1 1 1 -1 1 1 -1 1 1] [ 1 1 1 1 -1 1 1 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2547, 2547, 2547, 1641, 1641] -------------------------------------------------- > Negative example #21 > Pc_20: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-1 1 1 1 1 1 1 1 1 1] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [2218, 2218, 2218, 3626, 2218] -------------------------------------------------- > Negative example #22 > Pc_21: [[-2 2 -2 2 -2 -2 -2 -2 -2 -2] [-2 2 -2 2 -2 -2 -2 -2 -2 -2] [-2 2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 -1 1 -1 1 -2 -2 1 1 -2] [-2 -1 -2 -1 1 1 1 -2 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2111, 2111, 2111, 1715, 1511] -------------------------------------------------- > Negative example #23 > Pc_22: [[-2 -2 -2 -2 -2 -2 2 2 -2 2] [-2 -2 -2 -2 -2 -2 2 2 -2 2] [-2 -2 -2 -2 -2 -2 2 2 -2 2] [ 1 -2 1 1 1 1 -1 -1 1 -1] [ 1 1 1 1 1 1 -1 -1 1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [954, 954, 954, 3455, 3455] -------------------------------------------------- > Negative example #24 > Pc_23: [[-2 -2 -2 2 2 -2 -2 -2 -2 -2] [-2 -2 -2 2 2 -2 -2 -2 -2 -2] [-2 -2 -2 2 2 -2 -2 -2 -2 -2] [ 1 1 1 -1 2 1 1 1 1 1] [-2 -2 -2 2 2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [2890, 2890, 2890, 128, 2890] -------------------------------------------------- > Negative example #25 > Pc_24: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [ 1 1 1 1 1 1 1 1 -2 -1] [-2 -2 -2 -2 -2 1 -2 -2 -2 2]] > colors: [2, 2, 2, 1, 1] > indices: [3193, 3193, 3193, 2430, 273] -------------------------------------------------- > Negative example #26 > Pc_25: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [ 1 1 1 1 1 1 1 -2 1 -1] [ 1 1 1 1 1 1 1 1 1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [1865, 1865, 1865, 1692, 1692] -------------------------------------------------- > Negative example #27 > Pc_26: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 -2 1 1 1 1 1]] > colors: [-2, -2, -2, -2, 1] > indices: [2442, 2442, 2442, 2442, 2442] -------------------------------------------------- > Negative example #28 > Pc_27: [[-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 -1 -2 1 -2 1 1 1 -2 1] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [3152, 3152, 3152, 3541, 3152] -------------------------------------------------- > Negative example #29 > Pc_28: [[ 2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-1 1 2 1 1 1 1 1 1 1] [-1 1 -1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1586, 1586, 1586, 803, 803] -------------------------------------------------- > Negative example #30 > Pc_29: [[-2 2 -2 -2 -2 -2 2 -2 -2 -2] [-2 2 -2 -2 -2 -2 2 -2 -2 -2] [-2 2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 -1 1 1 1 1 -1 1 1 -2] [ 1 -1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1998, 1998, 1998, 2118, 2118] -------------------------------------------------- > Negative example #31 > Pc_30: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 2] [ 1 1 1 1 1 -2 1 1 1 -1] [ 1 1 1 1 1 1 1 1 1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [1802, 1802, 1802, 1315, 1315] -------------------------------------------------- > Negative example #32 > Pc_31: [[-2 -2 -2 -2 -2 -2 -2 -2 2 2] [-2 -2 -2 -2 -2 -2 -2 -2 2 2] [-2 -2 -2 -2 -2 -2 -2 -2 2 2] [ 1 1 1 1 1 1 1 1 -1 -1] [ 1 1 1 1 1 1 1 1 -1 -1]] > colors: [2, 2, 2, 1, 1] > indices: [1366, 1366, 1366, 2622, 2622] -------------------------------------------------- > Negative example #33 > Pc_32: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 -2 1 -2 1 2 1 1 1] [-2 1 1 1 -2 -2 -1 -2 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [348, 348, 348, 381, 2169] -------------------------------------------------- > Negative example #34 > Pc_33: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 -2 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [3635, 3635, 3635, 3635, 3635] -------------------------------------------------- > Negative example #35 > Pc_34: [[-2 2 2 -2 2 -2 -2 -2 -2 -2] [-2 2 2 -2 2 -2 -2 -2 -2 -2] [-2 2 2 -2 2 -2 -2 -2 -2 -2] [ 1 -1 -1 1 -1 1 1 1 1 1] [ 1 -1 -1 1 -1 1 1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [194, 194, 194, 314, 314] -------------------------------------------------- > Negative example #36 > Pc_35: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [3313, 3313, 3313, 3313, 3313] -------------------------------------------------- > Negative example #37 > Pc_36: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -2 1 1 1 1 1 1] [ 1 1 1 1 1 1 -2 -2 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [3531, 3531, 3531, 3531, 3531] -------------------------------------------------- > Negative example #38 > Pc_37: [[-2 2 -2 2 -2 -2 -2 -2 -2 2] [-2 2 -2 2 -2 -2 -2 -2 -2 2] [-2 2 -2 2 -2 -2 -2 -2 -2 2] [ 1 -1 1 -1 1 -2 1 1 1 -1] [-2 2 -2 2 -2 -2 -2 -2 -2 2]] > colors: [2, 2, 2, 1, 2] > indices: [92, 92, 92, 2612, 92] -------------------------------------------------- > Negative example #39 > Pc_38: [[-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 2 -2] [ 1 1 1 1 1 1 1 1 2 1] [ 1 1 1 1 1 1 1 1 -1 1]] > colors: [2, 2, 2, 1, 1] > indices: [3011, 3011, 3011, 3623, 3623] -------------------------------------------------- > Negative example #40 > Pc_39: [[-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 -1 -2 1 -2 1 1 1 -2 1] [-2 2 -2 -2 -2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [3152, 3152, 3152, 3541, 3152] -------------------------------------------------- > Negative example #41 > Pc_40: [[-2 -2 -2 -2 -2 2 2 -2 -2 -2] [-2 -2 -2 -2 -2 2 2 -2 -2 -2] [-2 -2 -2 -2 -2 2 2 -2 -2 -2] [ 1 1 -2 1 1 -1 -1 1 -2 1] [ 1 1 1 1 1 -1 -1 1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [2316, 2316, 2316, 1606, 1606] -------------------------------------------------- > Negative example #42 > Pc_41: [[-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 2 -2 -2 -2] [ 1 1 1 1 1 1 -1 1 1 1]] > colors: [2, 2, 2, 2, 1] > indices: [944, 944, 944, 944, 124] -------------------------------------------------- > Negative example #43 > Pc_42: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, 1, 1] > indices: [1352, 1352, 1352, 1352, 1352] -------------------------------------------------- > Negative example #44 > Pc_43: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 1 -2 -2 1]] > colors: [2, 2, 2, 2, 1] > indices: [1194, 1194, 1194, 1194, 567] -------------------------------------------------- > Negative example #45 > Pc_44: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1]] > colors: [2, 2, 2, 2, 1] > indices: [3000, 3000, 3000, 3000, 585] -------------------------------------------------- > Negative example #46 > Pc_45: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1]] > colors: [-2, -2, -2, -2, 1] > indices: [403, 403, 403, 403, 403] -------------------------------------------------- > Negative example #47 > Pc_46: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -2 1 1 1 1 1 1 1] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] > colors: [-2, -2, -2, 1, -2] > indices: [1781, 1781, 1781, 1781, 1781] -------------------------------------------------- > Negative example #48 > Pc_47: [[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 2 1 1 1 1 -2 1 1 1 -2] [-1 1 1 -2 1 1 -2 -2 1 -2]] > colors: [2, 2, 2, 1, 1] > indices: [348, 348, 348, 2860, 2860] -------------------------------------------------- > Negative example #49 > Pc_48: [[-2 2 -2 -2 2 -2 -2 2 -2 -2] [-2 2 -2 -2 2 -2 -2 2 -2 -2] [-2 2 -2 -2 2 -2 -2 2 -2 -2] [ 1 -1 1 -2 -1 1 1 -1 1 1] [ 1 -1 1 1 2 1 -2 -1 1 1]] > colors: [2, 2, 2, 1, 1] > indices: [1551, 1551, 1551, 3333, 3333] -------------------------------------------------- > Negative example #50 > Pc_49: [[ 2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 2 -2 -2 -2 -2 -2] [ 2 -2 -2 -2 2 -2 -2 -2 -2 -2] [-1 1 1 1 -1 1 1 1 1 1] [ 2 -2 -2 -2 2 -2 -2 -2 -2 -2]] > colors: [2, 2, 2, 1, 2] > indices: [3587, 3587, 3587, 2035, 3587] -------------------------------------------------- [info] Found 39 cases for which the majority 'color' does not come from the same training instance
In [ ]:
Copied!
import itertools
distances, indices = fknn.search(test_split.X.T)
n_test = indices.shape[0]
i = np.random.choice(range(n_test), 1)[0]
pairs = itertools.product([indices[i][0], ], indices[i, :])
for i, (u, v) in enumerate(list(pairs)):
print(f"Pair #{i}: {u} vs {v}")
print(X_train[u])
print(X_train[v])
print("-" * 50)
print("> Distance=", LA.norm(X_train[u]-X_train[v], 2), '\n')
import itertools
distances, indices = fknn.search(test_split.X.T)
n_test = indices.shape[0]
i = np.random.choice(range(n_test), 1)[0]
pairs = itertools.product([indices[i][0], ], indices[i, :])
for i, (u, v) in enumerate(list(pairs)):
print(f"Pair #{i}: {u} vs {v}")
print(X_train[u])
print(X_train[v])
print("-" * 50)
print("> Distance=", LA.norm(X_train[u]-X_train[v], 2), '\n')
Pair #0: 2528 vs 2528 [0.501 0. 0.124 0. 0.003] [0.501 0. 0.124 0. 0.003] -------------------------------------------------- > Distance= 0.0 Pair #1: 2528 vs 2103 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.126 0. 0.004] -------------------------------------------------- > Distance= 0.0027999465682221406 Pair #2: 2528 vs 750 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.126 0. 0.004] -------------------------------------------------- > Distance= 0.0030372388203489453 Pair #3: 2528 vs 1140 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.122 0. 0.003] -------------------------------------------------- > Distance= 0.001741746473479801 Pair #4: 2528 vs 600 [0.501 0. 0.124 0. 0.003] [0.501 0. 0.127 0. 0.002] -------------------------------------------------- > Distance= 0.0032291528295385515 Pair #5: 2528 vs 2159 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.127 0. 0.002] -------------------------------------------------- > Distance= 0.0037394488048762384 Pair #6: 2528 vs 1542 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.124 0. 0.006] -------------------------------------------------- > Distance= 0.0034048678280782113 Pair #7: 2528 vs 3598 [0.501 0. 0.124 0. 0.003] [0.499 0. 0.124 0. 0.006] -------------------------------------------------- > Distance= 0.003679200041038483 Pair #8: 2528 vs 2056 [0.501 0. 0.124 0. 0.003] [0.5 0. 0.127 0. 0.005] -------------------------------------------------- > Distance= 0.004036093463855927 Pair #9: 2528 vs 3211 [0.501 0. 0.124 0. 0.003] [0.499 0. 0.127 0. 0.005] -------------------------------------------------- > Distance= 0.004580235491064864
Guesstimating the labeling for test split (T)¶
- Estimated labeling (
lh) vs true labels (y_true)
In [ ]:
Copied!
# Restore confidence matrices (in case if modified)
Pc, C0, Cw, Cn, *rest = \
uc.evalConfidenceMatrices(R, L_train, alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure, policy_threshold=policy_threshold,
# Optional debug/test parameters
U=U, n_train=n_train, fold_number=fold_number,
is_cascade=True,
verbose=0)
assert (Pc.shape == R.shape) and (Cn.shape == R.shape)
# Restore confidence matrices (in case if modified)
Pc, C0, Cw, Cn, *rest = \
uc.evalConfidenceMatrices(R, L_train, alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure, policy_threshold=policy_threshold,
# Optional debug/test parameters
U=U, n_train=n_train, fold_number=fold_number,
is_cascade=True,
verbose=0)
assert (Pc.shape == R.shape) and (Cn.shape == R.shape)
(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...
Guesstimated labeling of T via majority vote¶
In [ ]:
Copied!
def f_score(precision, recall, beta=1.0):
f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
return f_beta
# from common import f_score
# Color matric and labeling matrix under gold standard
Pc_true, Lh = pmodel.color_matrix(T, L_test, p_threshold)
# labeling by majority vote
lh_max_vote = uc.estimateLabels(T, p_th=p_threshold, pos_label=1)
acc_max_vote = np.sum(lh_max_vote == L_test) / (len(L_test)+0.0)
# workflow: p_threshold -> lh -> color matrix
Pc_maxvote, Lh0 = pmodel.color_matrix(T, lh_max_vote, p_threshold) # Mc: Color matrix evaluated via estimated labels
Pf_maxvote = pmodel.to_preference(Pc_maxvote, neutral=0.0)
# => {TP, TN}-entries are desirable and thus encoded as 1s in `Pf_maxvote` whereas {FP, FN}-entries are not desirable hence encoded as 0s
metrics = pmodel.eval_estimated_probability_filter(Pf_maxvote, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via majority vote")
print(f"> Labeling accuracy: {acc_max_vote}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
def f_score(precision, recall, beta=1.0):
f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
return f_beta
# from common import f_score
# Color matric and labeling matrix under gold standard
Pc_true, Lh = pmodel.color_matrix(T, L_test, p_threshold)
# labeling by majority vote
lh_max_vote = uc.estimateLabels(T, p_th=p_threshold, pos_label=1)
acc_max_vote = np.sum(lh_max_vote == L_test) / (len(L_test)+0.0)
# workflow: p_threshold -> lh -> color matrix
Pc_maxvote, Lh0 = pmodel.color_matrix(T, lh_max_vote, p_threshold) # Mc: Color matrix evaluated via estimated labels
Pf_maxvote = pmodel.to_preference(Pc_maxvote, neutral=0.0)
# => {TP, TN}-entries are desirable and thus encoded as 1s in `Pf_maxvote` whereas {FP, FN}-entries are not desirable hence encoded as 0s
metrics = pmodel.eval_estimated_probability_filter(Pf_maxvote, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via majority vote")
print(f"> Labeling accuracy: {acc_max_vote}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
================================================================================ Guesstimated labeling (on T) via majority vote ================================================================================ > Labeling accuracy: 0.54 > Reliable-to-correct ratio: 0.54 > Precision: 0.5288064483522509, Recall: 0.6634396052147127 > Predcitio(TP): 0.06301438708477698, Recall(TP): 0.6712310377231844 => f1(TP): 0.1152127367915651 > Error rate: 0.00012119165978768243
Guesstimated labeling of T via kNNs¶
In [ ]:
Copied!
# We already have the color matrix for the training split (R) but let's verify
Pc_verify, Lh_train = pmodel.color_matrix(R, L_train, p_threshold)
Pc_train = Pc.A if is_sparse(Pc) else Pc
assert np.all(Pc_verify == Pc_train)
# Color matrix for T can only be estimated as we do not know `L_test` in general
ratios = uknn.estimate_ratios(fknn, R, Pc, n_samples=30)
print(ratios)
# lh_color = pmodel.color_matrix_to_labels(Pc)
# acc_color = np.sum(lh_color == L_test) / (len(L_test)+0.0)
# We already have the color matrix for the training split (R) but let's verify
Pc_verify, Lh_train = pmodel.color_matrix(R, L_train, p_threshold)
Pc_train = Pc.A if is_sparse(Pc) else Pc
assert np.all(Pc_verify == Pc_train)
# Color matrix for T can only be estimated as we do not know `L_test` in general
ratios = uknn.estimate_ratios(fknn, R, Pc, n_samples=30)
print(ratios)
# lh_color = pmodel.color_matrix_to_labels(Pc)
# acc_color = np.sum(lh_color == L_test) / (len(L_test)+0.0)
{1: 0.6760969445202233, 0: 0.6760969445202233}
In [ ]:
Copied!
def color_vector(col_vec, label, p_th, reduced_negative=False, pos_label=1, neg_label=0):
"""
"""
col_vec = np.asarray(col_vec)
# if col_vec.ndim == 1:
# pass # no-op
if col_vec.ndim == 2:
assert np.squeeze(col_vec).ndim == 1
col_vec = col_vec.reshape(-1, 1) # turn into a column vector
Pc_i, Lh_i = pmodel.color_matrix(col_vec, np.asarray([label, ]), p_th=p_th)
colors = np.squeeze(Pc_i)
assert colors.ndim == 1
return colors
print(list(R[:, 3]))
print(color_vector(R[:, 3], label=1, p_th=p_threshold))
print(color_vector(R[:, 3], label=0, p_th=p_threshold))
matching_fn = uknn.estimate_labels_by_matching(fknn, R, Pc, p_threshold, verbose=1)
lh_knn = matching_fn(T)
acc_knn = np.sum(lh_knn == L_test) / (len(L_test)+0.0)
Pc_knn, Lh1 = pmodel.color_matrix(T, lh_knn, p_threshold) # Mc: Color matrix evaluated via estimated labels
Pf_knn = pmodel.to_preference(Pc_knn, neutral=0.0)
metrics = pmodel.eval_estimated_probability_filter(Pf_knn, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via kNN")
print(f"> Labeling accuracy: {acc_knn}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
# ... downside is that high accuracy could be due to TNs but not TPs
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
def color_vector(col_vec, label, p_th, reduced_negative=False, pos_label=1, neg_label=0):
"""
"""
col_vec = np.asarray(col_vec)
# if col_vec.ndim == 1:
# pass # no-op
if col_vec.ndim == 2:
assert np.squeeze(col_vec).ndim == 1
col_vec = col_vec.reshape(-1, 1) # turn into a column vector
Pc_i, Lh_i = pmodel.color_matrix(col_vec, np.asarray([label, ]), p_th=p_th)
colors = np.squeeze(Pc_i)
assert colors.ndim == 1
return colors
print(list(R[:, 3]))
print(color_vector(R[:, 3], label=1, p_th=p_threshold))
print(color_vector(R[:, 3], label=0, p_th=p_threshold))
matching_fn = uknn.estimate_labels_by_matching(fknn, R, Pc, p_threshold, verbose=1)
lh_knn = matching_fn(T)
acc_knn = np.sum(lh_knn == L_test) / (len(L_test)+0.0)
Pc_knn, Lh1 = pmodel.color_matrix(T, lh_knn, p_threshold) # Mc: Color matrix evaluated via estimated labels
Pf_knn = pmodel.to_preference(Pc_knn, neutral=0.0)
metrics = pmodel.eval_estimated_probability_filter(Pf_knn, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via kNN")
print(f"> Labeling accuracy: {acc_knn}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
# ... downside is that high accuracy could be due to TNs but not TPs
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
[0.5005034693335313, 0.0, 0.08009972560479113, 4.519121183621038e-16, 0.0008115769903625958] [ 2. 2. 2. -1. -1.] [-2. -2. -2. 1. 1.] [info] Pc_i: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1] [ 1 1 -1 1 1 1 1 1 1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. -1.] ... N_matches(ti): 0 ...... sum distances: 9.2 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. 1.] ... N_matches(ti): 0 ...... sum distances: 2.8000000000000003 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 1 -2 1 -2] [ 1 1 1 1 1 1 1 1 1 1] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. 2.] ... N_matches(ti): 0 ...... sum distances: 10.0 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. -2.] ... N_matches(ti): 0 ...... sum distances: 2.4000000000000004 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 2 -2 -2 -2 -2 -2 -2] [ 1 1 1 -1 1 1 1 1 1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. 2. -1.] ... N_matches(ti): 0 ...... sum distances: 9.2 ... Label = 0 ... Color(ti): [ 1. -2. -2. -2. 1.] ... N_matches(ti): 0 ...... sum distances: 2.8000000000000007 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. 2.] ... N_matches(ti): 0 ...... sum distances: 10.0 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. -2.] ... N_matches(ti): 0 ...... sum distances: 1.9999999999999998 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 1 1 1] [ 1 1 1 1 1 1 1 1 1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. -1.] ... N_matches(ti): 0 ...... sum distances: 10.0 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. 1.] ... N_matches(ti): 0 ...... sum distances: 1.9999999999999998 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 1 1 1 1 1 1 -1 1 1] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. 2.] ... N_matches(ti): 0 ...... sum distances: 9.2 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. -2.] ... N_matches(ti): 0 ...... sum distances: 2.8000000000000003 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 1 1 1 1 1 -2 -2 1] [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. 2.] ... N_matches(ti): 0 ...... sum distances: 10.0 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. -2.] ... N_matches(ti): 0 ...... sum distances: 2.4 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [-2 -2 -2 -2 -2 -2 -2 2 -2 -2] [ 1 1 1 1 1 1 1 -1 1 1] [ 1 1 1 1 1 1 1 -1 1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. -1.] ... N_matches(ti): 0 ...... sum distances: 9.2 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. 1.] ... N_matches(ti): 0 ...... sum distances: 2.8000000000000003 -------------------------------------------------- [info] Pc_i: [[-2 -2 -2 2 -2 2 -2 -2 2 -2] [-2 -2 -2 2 -2 2 -2 -2 2 -2] [-2 -2 -2 2 -2 2 -2 -2 2 -2] [ 1 1 1 -1 1 -1 1 1 -1 1] [ 1 1 1 -1 1 -1 1 1 -1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. 2. -1.] ... N_matches(ti): 0 ...... sum distances: 8.200000000000001 ... Label = 0 ... Color(ti): [ 1. -2. -2. -2. 1.] ... N_matches(ti): 0 ...... sum distances: 5.800000000000001 -------------------------------------------------- [info] Pc_i: [[-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [-2 -2 2 -2 -2 -2 -2 -2 -2 -2] [ 1 1 -1 1 1 1 1 1 1 1] [ 1 1 -1 1 1 1 1 1 1 1]] ... Label = 1 ... Color(ti): [-1. 2. 2. -1. -1.] ... N_matches(ti): 0 ...... sum distances: 9.2 ... Label = 0 ... Color(ti): [ 1. -2. -2. 1. 1.] ... N_matches(ti): 0 ...... sum distances: 2.8000000000000003 -------------------------------------------------- ================================================================================ Guesstimated labeling (on T) via kNN ================================================================================ > Labeling accuracy: 0.9008 > Reliable-to-correct ratio: 0.9008 > Precision: 0.901522221729115, Recall: 0.8980313333232225 > Predcitio(TP): 0.018140583692716653, Recall(TP): 0.15342423719387072 => f1(TP): 0.03244494064603671 > Error rate: 3.19006875384525e-05
Use kNN-estimated labeling and color matrix as a probability filter¶
A. Prepare the data
In [ ]:
Copied!
import utils_stacking as ustk
assert Pc.shape == R.shape, "At this point the `Pc` should only hold the colors for R"
R = train_split.X
L_train = train_split.L
T = test_split.X
# L = np.hstack([L_train, lh_knn]) # Note that the labeling for the test split is just an estimate (previously we used majority vote; here we use kNNs)
Pc, C0, Cw, Cn, *rest = uc.eval_confidence_given_color_matrix(
X=(R, T),
L=(L_train, lh_knn),
Pc=(Pc, Pc_knn),
# n_train = R.shape[1], # if (R, T) is passed instead of an already-merged X, then n_train is not needed
alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure,
policy_threshold=policy_threshold)
ustk.verify_shape(X, R, T, L, U, p_threshold) # verify the shape of all key quantities
# Check: Wherever Pc is negative, the corresponding entries in Cn must be 0 (By constrast, C is a full/dense confidence matrix)
assert np.all(Cn[Pc < 0]==0)
assert np.all(Cn[Pc > 0]>0)
X = np.hstack([R, T])
L = np.hstack([L_train, lh_knn])
import utils_stacking as ustk
assert Pc.shape == R.shape, "At this point the `Pc` should only hold the colors for R"
R = train_split.X
L_train = train_split.L
T = test_split.X
# L = np.hstack([L_train, lh_knn]) # Note that the labeling for the test split is just an estimate (previously we used majority vote; here we use kNNs)
Pc, C0, Cw, Cn, *rest = uc.eval_confidence_given_color_matrix(
X=(R, T),
L=(L_train, lh_knn),
Pc=(Pc, Pc_knn),
# n_train = R.shape[1], # if (R, T) is passed instead of an already-merged X, then n_train is not needed
alpha=alpha,
p_threshold=p_threshold,
conf_measure=conf_measure,
policy_threshold=policy_threshold)
ustk.verify_shape(X, R, T, L, U, p_threshold) # verify the shape of all key quantities
# Check: Wherever Pc is negative, the corresponding entries in Cn must be 0 (By constrast, C is a full/dense confidence matrix)
assert np.all(Cn[Pc < 0]==0)
assert np.all(Cn[Pc > 0]>0)
X = np.hstack([R, T])
L = np.hstack([L_train, lh_knn])
(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...
Train CFNet with the new data¶
Rstays the same- Quantities associated with
T(e.g.lh_knn,Pc_knn) are merged with their counterparts associated with training split
In [ ]:
Copied!
import cf_models as cm
n_users, n_items = X.shape
fold_number = 0
test_size = 0.1
policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100
lr = 0.001
batch_size = 64
epochs = 200
loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...
# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix
#
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label
cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
# cf_model = cm.get_cfnet_approximating_labels(n_users, n_items, n_factors)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn),
input_data=(R, T, U, L_train, L_test),
# Should we combine R and T into a single matrix X? Set to True if so
is_cascade = True, # Set to True here to combine R and T into X
lh = lh_knn, # supply the pre-computed, kNN-based estimated labels for T
# SGD optimization parameters
test_size = test_size,
epochs = epochs,
batch_size=batch_size,
# CF hyperparameters
# n_factors=n_factors, # this is factored into model definition
alpha=alpha,
conf_measure=conf_measure,
# conf_type='Cn', # default sparse confidence matrix (Cn)
# target_type=target_type,
policy_threshold=policy_threshold,
fold_number=fold_number)
import cf_models as cm
n_users, n_items = X.shape
fold_number = 0
test_size = 0.1
policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100
lr = 0.001
batch_size = 64
epochs = 200
loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...
# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix
#
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label
cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
# cf_model = cm.get_cfnet_approximating_labels(n_users, n_items, n_factors)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn),
input_data=(R, T, U, L_train, L_test),
# Should we combine R and T into a single matrix X? Set to True if so
is_cascade = True, # Set to True here to combine R and T into X
lh = lh_knn, # supply the pre-computed, kNN-based estimated labels for T
# SGD optimization parameters
test_size = test_size,
epochs = epochs,
batch_size=batch_size,
# CF hyperparameters
# n_factors=n_factors, # this is factored into model definition
alpha=alpha,
conf_measure=conf_measure,
# conf_type='Cn', # default sparse confidence matrix (Cn)
# target_type=target_type,
policy_threshold=policy_threshold,
fold_number=fold_number)
[merge] Merging 'L_train' and 'lh': len(L_train): 3750 || len(lh): 1250 => len(L): 5000 [merge] Merging 'R' and 'T': shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000) (make_cn) Using WEIGHTED confidence matrix to approximate ratings ... [info] Confidence matrix type: Cn, target data type: label Epoch 1/200 352/352 [==============================] - 4s 8ms/step - loss: 2.7591 - val_loss: 3.3721 Epoch 2/200 352/352 [==============================] - 2s 7ms/step - loss: 2.2233 - val_loss: 1.9442 Epoch 3/200 352/352 [==============================] - 2s 7ms/step - loss: 2.1033 - val_loss: 2.1897 Epoch 4/200 352/352 [==============================] - 2s 7ms/step - loss: 1.1302 - val_loss: 1.1313 Epoch 5/200 352/352 [==============================] - 2s 7ms/step - loss: 0.7799 - val_loss: 1.0086 Epoch 6/200 352/352 [==============================] - 2s 6ms/step - loss: 0.6457 - val_loss: 0.9135 Epoch 7/200 352/352 [==============================] - 2s 7ms/step - loss: 0.5857 - val_loss: 0.8766 Epoch 8/200 352/352 [==============================] - 2s 7ms/step - loss: 0.5545 - val_loss: 0.8564 Epoch 9/200 352/352 [==============================] - 2s 7ms/step - loss: 0.5353 - val_loss: 0.8500 Epoch 10/200 352/352 [==============================] - 3s 7ms/step - loss: 0.5183 - val_loss: 0.8246 Epoch 11/200 352/352 [==============================] - 2s 7ms/step - loss: 0.5021 - val_loss: 0.8175 Epoch 12/200 352/352 [==============================] - 2s 7ms/step - loss: 0.4950 - val_loss: 0.7812 Epoch 13/200 352/352 [==============================] - 2s 6ms/step - loss: 0.4759 - val_loss: 0.7763 Epoch 14/200 352/352 [==============================] - 2s 7ms/step - loss: 0.4656 - val_loss: 0.7545 Epoch 15/200 352/352 [==============================] - 2s 6ms/step - loss: 0.4494 - val_loss: 0.7488 Epoch 16/200 352/352 [==============================] - 2s 7ms/step - loss: 0.4357 - val_loss: 0.7209 Epoch 17/200 352/352 [==============================] - 3s 7ms/step - loss: 0.4198 - val_loss: 0.7068 Epoch 18/200 352/352 [==============================] - 2s 7ms/step - loss: 0.4044 - val_loss: 0.6898 Epoch 19/200 352/352 [==============================] - 3s 7ms/step - loss: 0.3898 - val_loss: 0.6753 Epoch 20/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3787 - val_loss: 0.6564 Epoch 21/200 352/352 [==============================] - 3s 7ms/step - loss: 0.3645 - val_loss: 0.6430 Epoch 22/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3536 - val_loss: 0.6318 Epoch 23/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3447 - val_loss: 0.6222 Epoch 24/200 352/352 [==============================] - 3s 7ms/step - loss: 0.3371 - val_loss: 0.6135 Epoch 25/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3302 - val_loss: 0.6052 Epoch 26/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3238 - val_loss: 0.5967 Epoch 27/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3176 - val_loss: 0.5886 Epoch 28/200 352/352 [==============================] - 2s 6ms/step - loss: 0.3117 - val_loss: 0.5802 Epoch 29/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3060 - val_loss: 0.5720 Epoch 30/200 352/352 [==============================] - 2s 7ms/step - loss: 0.3004 - val_loss: 0.5641 Epoch 31/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2949 - val_loss: 0.5556 Epoch 32/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2895 - val_loss: 0.5476 Epoch 33/200 352/352 [==============================] - 2s 6ms/step - loss: 0.2843 - val_loss: 0.5397 Epoch 34/200 352/352 [==============================] - 2s 6ms/step - loss: 0.2795 - val_loss: 0.5318 Epoch 35/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2742 - val_loss: 0.5241 Epoch 36/200 352/352 [==============================] - 3s 8ms/step - loss: 0.2696 - val_loss: 0.5160 Epoch 37/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2646 - val_loss: 0.5082 Epoch 38/200 352/352 [==============================] - 3s 7ms/step - loss: 0.2603 - val_loss: 0.5011 Epoch 39/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2557 - val_loss: 0.4947 Epoch 40/200 352/352 [==============================] - 2s 6ms/step - loss: 0.2514 - val_loss: 0.4852 Epoch 41/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2462 - val_loss: 0.4789 Epoch 42/200 352/352 [==============================] - 2s 6ms/step - loss: 0.2419 - val_loss: 0.4703 Epoch 43/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2383 - val_loss: 0.4630 Epoch 44/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2329 - val_loss: 0.4575 Epoch 45/200 352/352 [==============================] - 2s 6ms/step - loss: 0.2301 - val_loss: 0.4487 Epoch 46/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2245 - val_loss: 0.4418 Epoch 47/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2210 - val_loss: 0.4347 Epoch 48/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2168 - val_loss: 0.4273 Epoch 49/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2118 - val_loss: 0.4206 Epoch 50/200 352/352 [==============================] - 3s 7ms/step - loss: 0.2100 - val_loss: 0.4197 Epoch 51/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2050 - val_loss: 0.4073 Epoch 52/200 352/352 [==============================] - 2s 7ms/step - loss: 0.2003 - val_loss: 0.4007 Epoch 53/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1967 - val_loss: 0.3942 Epoch 54/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1939 - val_loss: 0.3879 Epoch 55/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1895 - val_loss: 0.3826 Epoch 56/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1869 - val_loss: 0.3764 Epoch 57/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1824 - val_loss: 0.3693 Epoch 58/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1790 - val_loss: 0.3638 Epoch 59/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1763 - val_loss: 0.3575 Epoch 60/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1728 - val_loss: 0.3518 Epoch 61/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1687 - val_loss: 0.3460 Epoch 62/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1656 - val_loss: 0.3407 Epoch 63/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1627 - val_loss: 0.3349 Epoch 64/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1602 - val_loss: 0.3319 Epoch 65/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1570 - val_loss: 0.3244 Epoch 66/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1533 - val_loss: 0.3189 Epoch 67/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1502 - val_loss: 0.3138 Epoch 68/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1486 - val_loss: 0.3087 Epoch 69/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1445 - val_loss: 0.3039 Epoch 70/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1419 - val_loss: 0.2995 Epoch 71/200 352/352 [==============================] - 2s 6ms/step - loss: 0.1391 - val_loss: 0.2939 Epoch 72/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1366 - val_loss: 0.2892 Epoch 73/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1337 - val_loss: 0.2843 Epoch 74/200 352/352 [==============================] - 3s 8ms/step - loss: 0.1310 - val_loss: 0.2794 Epoch 75/200 352/352 [==============================] - 2s 6ms/step - loss: 0.1285 - val_loss: 0.2749 Epoch 76/200 352/352 [==============================] - 2s 6ms/step - loss: 0.1263 - val_loss: 0.2711 Epoch 77/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1241 - val_loss: 0.2661 Epoch 78/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1211 - val_loss: 0.2617 Epoch 79/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1187 - val_loss: 0.2578 Epoch 80/200 352/352 [==============================] - 3s 9ms/step - loss: 0.1164 - val_loss: 0.2536 Epoch 81/200 352/352 [==============================] - 2s 6ms/step - loss: 0.1145 - val_loss: 0.2523 Epoch 82/200 352/352 [==============================] - 2s 6ms/step - loss: 0.1123 - val_loss: 0.2452 Epoch 83/200 352/352 [==============================] - 3s 8ms/step - loss: 0.1095 - val_loss: 0.2412 Epoch 84/200 352/352 [==============================] - 2s 7ms/step - loss: 0.1074 - val_loss: 0.2372 Epoch 85/200 352/352 [==============================] - 3s 8ms/step - loss: 0.1053 - val_loss: 0.2332 Epoch 86/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1033 - val_loss: 0.2302 Epoch 87/200 352/352 [==============================] - 3s 7ms/step - loss: 0.1013 - val_loss: 0.2259 Epoch 88/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0993 - val_loss: 0.2222 Epoch 89/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0975 - val_loss: 0.2187 Epoch 90/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0952 - val_loss: 0.2147 Epoch 91/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0933 - val_loss: 0.2113 Epoch 92/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0915 - val_loss: 0.2077 Epoch 93/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0896 - val_loss: 0.2043 Epoch 94/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0879 - val_loss: 0.2011 Epoch 95/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0862 - val_loss: 0.1979 Epoch 96/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0845 - val_loss: 0.1946 Epoch 97/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0828 - val_loss: 0.1913 Epoch 98/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0810 - val_loss: 0.1881 Epoch 99/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0793 - val_loss: 0.1850 Epoch 100/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0778 - val_loss: 0.1822 Epoch 101/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0764 - val_loss: 0.1794 Epoch 102/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0747 - val_loss: 0.1763 Epoch 103/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0732 - val_loss: 0.1735 Epoch 104/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0717 - val_loss: 0.1707 Epoch 105/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0703 - val_loss: 0.1680 Epoch 106/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0689 - val_loss: 0.1653 Epoch 107/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0675 - val_loss: 0.1625 Epoch 108/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0660 - val_loss: 0.1600 Epoch 109/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0647 - val_loss: 0.1572 Epoch 110/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0635 - val_loss: 0.1547 Epoch 111/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0621 - val_loss: 0.1522 Epoch 112/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0609 - val_loss: 0.1497 Epoch 113/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0597 - val_loss: 0.1475 Epoch 114/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0584 - val_loss: 0.1449 Epoch 115/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0572 - val_loss: 0.1426 Epoch 116/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0560 - val_loss: 0.1403 Epoch 117/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0549 - val_loss: 0.1384 Epoch 118/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0538 - val_loss: 0.1359 Epoch 119/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0527 - val_loss: 0.1337 Epoch 120/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0516 - val_loss: 0.1316 Epoch 121/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0506 - val_loss: 0.1297 Epoch 122/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0495 - val_loss: 0.1275 Epoch 123/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0485 - val_loss: 0.1255 Epoch 124/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0475 - val_loss: 0.1237 Epoch 125/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0466 - val_loss: 0.1216 Epoch 126/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0456 - val_loss: 0.1197 Epoch 127/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0446 - val_loss: 0.1178 Epoch 128/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0437 - val_loss: 0.1160 Epoch 129/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0429 - val_loss: 0.1142 Epoch 130/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0419 - val_loss: 0.1125 Epoch 131/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0411 - val_loss: 0.1108 Epoch 132/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0403 - val_loss: 0.1088 Epoch 133/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0394 - val_loss: 0.1074 Epoch 134/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0386 - val_loss: 0.1058 Epoch 135/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0378 - val_loss: 0.1041 Epoch 136/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0371 - val_loss: 0.1025 Epoch 137/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0363 - val_loss: 0.1010 Epoch 138/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0355 - val_loss: 0.0993 Epoch 139/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0348 - val_loss: 0.0978 Epoch 140/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0341 - val_loss: 0.0964 Epoch 141/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0334 - val_loss: 0.0949 Epoch 142/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0327 - val_loss: 0.0936 Epoch 143/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0320 - val_loss: 0.0921 Epoch 144/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0314 - val_loss: 0.0908 Epoch 145/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0307 - val_loss: 0.0895 Epoch 146/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0301 - val_loss: 0.0881 Epoch 147/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0295 - val_loss: 0.0868 Epoch 148/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0289 - val_loss: 0.0856 Epoch 149/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0283 - val_loss: 0.0842 Epoch 150/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0277 - val_loss: 0.0831 Epoch 151/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0271 - val_loss: 0.0818 Epoch 152/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0265 - val_loss: 0.0806 Epoch 153/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0260 - val_loss: 0.0795 Epoch 154/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0255 - val_loss: 0.0783 Epoch 155/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0249 - val_loss: 0.0772 Epoch 156/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0244 - val_loss: 0.0762 Epoch 157/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0239 - val_loss: 0.0749 Epoch 158/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0234 - val_loss: 0.0740 Epoch 159/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0230 - val_loss: 0.0730 Epoch 160/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0225 - val_loss: 0.0719 Epoch 161/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0220 - val_loss: 0.0709 Epoch 162/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0216 - val_loss: 0.0699 Epoch 163/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0211 - val_loss: 0.0689 Epoch 164/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0207 - val_loss: 0.0680 Epoch 165/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0202 - val_loss: 0.0671 Epoch 166/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0198 - val_loss: 0.0662 Epoch 167/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0194 - val_loss: 0.0652 Epoch 168/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0190 - val_loss: 0.0644 Epoch 169/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0186 - val_loss: 0.0635 Epoch 170/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0182 - val_loss: 0.0627 Epoch 171/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0179 - val_loss: 0.0618 Epoch 172/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0175 - val_loss: 0.0610 Epoch 173/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0171 - val_loss: 0.0602 Epoch 174/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0168 - val_loss: 0.0594 Epoch 175/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0164 - val_loss: 0.0586 Epoch 176/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0161 - val_loss: 0.0579 Epoch 177/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0158 - val_loss: 0.0571 Epoch 178/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0154 - val_loss: 0.0563 Epoch 179/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0151 - val_loss: 0.0556 Epoch 180/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0148 - val_loss: 0.0549 Epoch 181/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0145 - val_loss: 0.0541 Epoch 182/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0142 - val_loss: 0.0534 Epoch 183/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0139 - val_loss: 0.0528 Epoch 184/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0136 - val_loss: 0.0522 Epoch 185/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0133 - val_loss: 0.0515 Epoch 186/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0131 - val_loss: 0.0508 Epoch 187/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0128 - val_loss: 0.0502 Epoch 188/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0125 - val_loss: 0.0495 Epoch 189/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0123 - val_loss: 0.0490 Epoch 190/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0120 - val_loss: 0.0484 Epoch 191/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0118 - val_loss: 0.0478 Epoch 192/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0115 - val_loss: 0.0472 Epoch 193/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0113 - val_loss: 0.0467 Epoch 194/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0111 - val_loss: 0.0461 Epoch 195/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0108 - val_loss: 0.0456 Epoch 196/200 352/352 [==============================] - 3s 8ms/step - loss: 0.0106 - val_loss: 0.0450 Epoch 197/200 352/352 [==============================] - 2s 7ms/step - loss: 0.0104 - val_loss: 0.0445 Epoch 198/200 352/352 [==============================] - 2s 6ms/step - loss: 0.0102 - val_loss: 0.0441 Epoch 199/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0100 - val_loss: 0.0436 Epoch 200/200 352/352 [==============================] - 3s 7ms/step - loss: 0.0098 - val_loss: 0.0430
In [ ]:
Copied!
analyzer = cm.analyze_reconstruction(cf_model, X, L, Pc, n_train, p_threshold=p_threshold, policy_threshold=policy_threshold)
highlight("(BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=False)
highlight("(BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=True)
analyzer = cm.analyze_reconstruction(cf_model, X, L, Pc, n_train, p_threshold=p_threshold, policy_threshold=policy_threshold)
highlight("(BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=False)
highlight("(BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=True)
================================================================================ (BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 74.72180508554011 [info] From T to Th, delta(Frobenius norm)= 38.623946721693805 [info] How different are lh and lh_new? 0.4576 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984 [result] Majority vote: F1 score with re-estimated Th: 0.18543046357615894 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.1842105263157895 [result] Best settings (complete): lh_maxvote, score: 0.20470262793914248 ================================================================================ (BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings ================================================================================ [info] From R to Rh, delta(Frobenius norm)= 68.85513758278863 [info] From T to Th, delta(Frobenius norm)= 34.85557728925823 [info] How different are lh and lh_new? 0.4584 [result] Majority vote: F1 score with the original T: 0.20470262793914248 [result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.2 [result] Majority vote: F1 score with re-estimated Th: 0.1842105263157895 [result] Stacking: F1 score with the original T: 0.125 [result] Stacking: F1 score with re-estimated Th: 0.1842105263157895 [result] Best settings (unreliable only): lh_maxvote, score: 0.20470262793914248
In [ ]:
Copied!