kNN Ensemble

Introduction¶

Contining on Part 1 and Part 2 of this demo series, we will focus on using kNN-based methods to re-estimate T (or the rating matrix associated with the test split) and subsequently, use the reestiamted T (Th) as the basis for label predictions.

Reference¶

Make kNN 300 times faster
- Faiss: Metric type and distances
- Summary of methods

In [ ]:

Copied!





import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys

# Colab 
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

# Plotting
import matplotlib.pylab as plt
# %matplotlib inline

from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display

# Progress
from tqdm import tqdm

################################################################
# Configure system environment
# - Please modify input_dir according to your local enviornment
#
################################################################

cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB: 
    # Run this demo on Google Colab
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Parameters for data
    input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
    # /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/data/data-is-life

    sys.path.append(input_dir)
else: 
    input_dir = cur_dir
    
if input_dir != cur_dir: 
    sys.path.append(input_dir)
    print(f"> Adding {input_dir} to sys path ...")
    print(sys.path)
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys

# Colab 
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

# Plotting
import matplotlib.pylab as plt
# %matplotlib inline

from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display

# Progress
from tqdm import tqdm

################################################################
# Configure system environment
# - Please modify input_dir according to your local enviornment
#
################################################################

cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB: 
    # Run this demo on Google Colab
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Parameters for data
    input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
    # /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/data/data-is-life

    sys.path.append(input_dir)
else: 
    input_dir = cur_dir
    
if input_dir != cur_dir: 
    sys.path.append(input_dir)
    print(f"> Adding {input_dir} to sys path ...")
    print(sys.path)

Mounted at /content/drive
> Adding /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble to sys path ...
['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble']

In [ ]:

Copied!





# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras

# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################

# Scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################

# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc 
import polarity_models as pmodel
from polarity_models import Polarity
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################

# Misc
import pprint
import tempfile
from typing import Dict, Text

np.set_printoptions(precision=3, edgeitems=5, suppress=True)
# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras

# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################

# Scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################

# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc 
import polarity_models as pmodel
from polarity_models import Polarity
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################

# Misc
import pprint
import tempfile
from typing import Dict, Text

np.set_printoptions(precision=3, edgeitems=5, suppress=True)

2.8.0

Generating training data¶

In [ ]:

Copied!

# %matplotlib inline
import data_pipeline as dp

max_class_ratio=0.99

# get the dataset
X0, y0 = dp.generate_imbalanced_data(class_ratio=max_class_ratio, verbose=1)
# %matplotlib inline
import data_pipeline as dp

max_class_ratio=0.99

# get the dataset
X0, y0 = dp.generate_imbalanced_data(class_ratio=max_class_ratio, verbose=1)

> n_classes: 2
[0 1]

> counts:
Counter({0: 4465, 1: 535})

Choosing base classifiers¶

In [ ]:

Copied!





# Create Base Learners
base_learners = [
                 ('RF', RandomForestClassifier(n_estimators= 200, 
                                                   oob_score = True, 
                                                   class_weight = "balanced", 
                                                   random_state = 20, 
                                                   ccp_alpha = 0.1)), 
                 ('KNNC', KNeighborsClassifier(n_neighbors = len(np.unique(y0))
                                                     , weights = 'distance')),
                #  ('SVC', SVC(kernel = 'linear', probability=True,
                #                    class_weight = 'balanced'
                #                   , break_ties = True)), 

                 ('GNB', GaussianNB()), 
                 ('QDA',  QuadraticDiscriminantAnalysis()), 
                 ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)), 
                 # ('DT', DecisionTreeClassifier(max_depth=5)),
                 # ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))),
                ]
# Create Base Learners
base_learners = [
                 ('RF', RandomForestClassifier(n_estimators= 200, 
                                                   oob_score = True, 
                                                   class_weight = "balanced", 
                                                   random_state = 20, 
                                                   ccp_alpha = 0.1)), 
                 ('KNNC', KNeighborsClassifier(n_neighbors = len(np.unique(y0))
                                                     , weights = 'distance')),
                #  ('SVC', SVC(kernel = 'linear', probability=True,
                #                    class_weight = 'balanced'
                #                   , break_ties = True)), 

                 ('GNB', GaussianNB()), 
                 ('QDA',  QuadraticDiscriminantAnalysis()), 
                 ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)), 
                 # ('DT', DecisionTreeClassifier(max_depth=5)),
                 # ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))),
                ]

Load pre-trained level-1 data¶

If it's unclear how to obtain the pre-trained dataset (e.g. probability matrices from base classifiers), please refer back to part 1 or part 2 of this demo series.

In [ ]:

Copied!





import cf_models as cm

tLoadPretrained = False
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################

if not tLoadPretrained:  
    # Use the previously selected base predictors (`base_learners`) to generate the level-1 dataset
    R, T, U, L_train, L_test = cm.demo_cf_stacking(input_data=(X0, y0), 
                                                   input_dir=input_dir, n_iter=n_iterations, 
                                                   base_learners=base_learners, # <<< base classifiers selected
                                                   verbose=1)
else: 
    R, T, U, L_train, L_test = dp.load_pretrained_level1_data(fold_number=fold_number, verbose=1, data_dir=data_dir)

# Derived quantities
n_train = R.shape[1]
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
lh = uc.estimateLabels(T, p_th=p_threshold) # We cannot use L_test (cheating), but we have to guesstimate
L = np.hstack((L_train, lh)) 
X = np.hstack((R, T))

assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")
import cf_models as cm

tLoadPretrained = False
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################

if not tLoadPretrained:  
    # Use the previously selected base predictors (`base_learners`) to generate the level-1 dataset
    R, T, U, L_train, L_test = cm.demo_cf_stacking(input_data=(X0, y0), 
                                                   input_dir=input_dir, n_iter=n_iterations, 
                                                   base_learners=base_learners, # <<< base classifiers selected
                                                   verbose=1)
else: 
    R, T, U, L_train, L_test = dp.load_pretrained_level1_data(fold_number=fold_number, verbose=1, data_dir=data_dir)

# Derived quantities
n_train = R.shape[1]
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
lh = uc.estimateLabels(T, p_th=p_threshold) # We cannot use L_test (cheating), but we have to guesstimate
L = np.hstack((L_train, lh)) 
X = np.hstack((R, T))

assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")

2.8.0

  0%|          | 0/1 [00:00<?, ?it/s]

(BaseCF) base est | name: RF, estimator: RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced', n_estimators=200,
                       oob_score=True, random_state=20)
(BaseCF) base est | name: KNNC, estimator: KNeighborsClassifier(n_neighbors=2, weights='distance')
(BaseCF) base est | name: GNB, estimator: GaussianNB()
(BaseCF) base est | name: QDA, estimator: QuadraticDiscriminantAnalysis()
(BaseCF) base est | name: MLPClassifier, estimator: MLPClassifier(alpha=1, max_iter=1000)
(BaseCF) Base predictors:
[1]  RF: RandomForestClassifier(ccp_alpha=0.1, class_weight='balanced', n_estimators=200,
                       oob_score=True, random_state=20)
[2]  QDA: QuadraticDiscriminantAnalysis()
[3]  MLPClassifier: MLPClassifier(alpha=1, max_iter=1000)
[4]  KNNC: KNeighborsClassifier(n_neighbors=2, weights='distance')
[5]  GNB: GaussianNB()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   26.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   34.1s finished

[info] Saving X_meta (shape=(3750, 5)) at:
/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/train-0.npz

[info] Saving X_meta (shape=(1250, 5)) at:
/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz

[info] Saving X_meta (shape=(1250, 5)) at:
/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz

[result] 0.11188811188811189
(cf_write) Adding new attribute y:
[0 0 0 0 0 ... 0 1 0 0 0]
...
(cf_write) Saving X_meta at:
/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble/data/test-0.npz

100%|██████████| 1/1 [01:24<00:00, 84.80s/it]

[info] list of base classifiers:
['RF' 'KNNC' 'GNB' 'QDA' 'MLPClassifier']

================================================================================
R: Rating/probability matrix for the TRAIN set
================================================================================
> shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000)

Confidence matrices¶

In [ ]:

Copied!





# import utils_cf as uc
# import polarity_models as pmodel

n_factors = 100
alpha = 100.0 
conf_measure = 'brier' # Options: 'brier', 'uniform'
policy_threshold = 'fmax'

Pc, C0, Cw, Cn, *rest = \
    uc.evalConfidenceMatrices(R, L_train, alpha=alpha, 
                                    p_threshold=p_threshold, 
                                    conf_measure=conf_measure, policy_threshold=policy_threshold, 
                                    
                                    # Optional debug/test parameters 
                                    U=U, n_train=n_train, fold_number=fold_number, 
                                    is_cascade=True,
                                    verbose=0)
assert C0.shape == R.shape
y_colors = pmodel.verify_colors(Pc)  # [log] status: ok
# import utils_cf as uc
# import polarity_models as pmodel

n_factors = 100
alpha = 100.0 
conf_measure = 'brier' # Options: 'brier', 'uniform'
policy_threshold = 'fmax'

Pc, C0, Cw, Cn, *rest = \
    uc.evalConfidenceMatrices(R, L_train, alpha=alpha, 
                                    p_threshold=p_threshold, 
                                    conf_measure=conf_measure, policy_threshold=policy_threshold, 
                                    
                                    # Optional debug/test parameters 
                                    U=U, n_train=n_train, fold_number=fold_number, 
                                    is_cascade=True,
                                    verbose=0)
assert C0.shape == R.shape
y_colors = pmodel.verify_colors(Pc)  # [log] status: ok

(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...

Training CFNet¶

In [ ]:

Copied!





import cf_models as cm

n_users, n_items = R.shape

fold_number = 0
test_size = 0.1

policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100

lr = 0.001 
batch_size = 64
epochs = 200

loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...

# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix 
# 
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label

cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn), 
                                input_data=(R, T, U, L_train, L_test), 

                          # Should we combine R and T into a single matrix X? Set to True if so
                          is_cascade = False, # Set to False here because we attempt to re-estimate T using kNNs

                          # lh = lh, # Estimated labels by default are the majority vote 
                          
                          # SGD optimization parameters
                          test_size = test_size,
                          epochs = epochs, 
                          batch_size=batch_size, 

                          # CF hyperparameters
                          # n_factors=n_factors, # this is factored into model definition
                          alpha=alpha, 
                          conf_measure=conf_measure, 
                          # conf_type='Cn', # default sparse confidence matrix (Cn)
                          # target_type=target_type,
                          
                          policy_threshold=policy_threshold, 
                          fold_number=fold_number) 
import cf_models as cm

n_users, n_items = R.shape

fold_number = 0
test_size = 0.1

policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100

lr = 0.001 
batch_size = 64
epochs = 200

loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...

# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix 
# 
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label

cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn), 
                                input_data=(R, T, U, L_train, L_test), 

                          # Should we combine R and T into a single matrix X? Set to True if so
                          is_cascade = False, # Set to False here because we attempt to re-estimate T using kNNs

                          # lh = lh, # Estimated labels by default are the majority vote 
                          
                          # SGD optimization parameters
                          test_size = test_size,
                          epochs = epochs, 
                          batch_size=batch_size, 

                          # CF hyperparameters
                          # n_factors=n_factors, # this is factored into model definition
                          alpha=alpha, 
                          conf_measure=conf_measure, 
                          # conf_type='Cn', # default sparse confidence matrix (Cn)
                          # target_type=target_type,
                          
                          policy_threshold=policy_threshold, 
                          fold_number=fold_number) 

(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...
[info] Confidence matrix type: Cn, target data type: label
Epoch 1/200
264/264 [==============================] - 4s 9ms/step - loss: 3.2968 - val_loss: 3.4268
Epoch 2/200
264/264 [==============================] - 1s 5ms/step - loss: 3.8964 - val_loss: 2.7115
Epoch 3/200
264/264 [==============================] - 1s 5ms/step - loss: 5.7728 - val_loss: 4.9412
Epoch 4/200
264/264 [==============================] - 1s 6ms/step - loss: 2.4199 - val_loss: 1.9336
Epoch 5/200
264/264 [==============================] - 1s 5ms/step - loss: 1.2328 - val_loss: 1.7244
Epoch 6/200
264/264 [==============================] - 2s 6ms/step - loss: 0.9934 - val_loss: 1.5784
Epoch 7/200
264/264 [==============================] - 1s 5ms/step - loss: 0.8551 - val_loss: 1.4817
Epoch 8/200
264/264 [==============================] - 1s 6ms/step - loss: 0.7566 - val_loss: 1.4098
Epoch 9/200
264/264 [==============================] - 1s 6ms/step - loss: 0.6829 - val_loss: 1.3618
Epoch 10/200
264/264 [==============================] - 1s 6ms/step - loss: 0.6331 - val_loss: 1.3251
Epoch 11/200
264/264 [==============================] - 1s 5ms/step - loss: 0.5965 - val_loss: 1.3035
Epoch 12/200
264/264 [==============================] - 1s 5ms/step - loss: 0.5663 - val_loss: 1.2753
Epoch 13/200
264/264 [==============================] - 1s 5ms/step - loss: 0.5445 - val_loss: 1.2593
Epoch 14/200
264/264 [==============================] - 2s 7ms/step - loss: 0.5252 - val_loss: 1.2421
Epoch 15/200
264/264 [==============================] - 1s 5ms/step - loss: 0.5111 - val_loss: 1.2230
Epoch 16/200
264/264 [==============================] - 1s 6ms/step - loss: 0.4965 - val_loss: 1.2075
Epoch 17/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4867 - val_loss: 1.2036
Epoch 18/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4735 - val_loss: 1.1880
Epoch 19/200
264/264 [==============================] - 1s 6ms/step - loss: 0.4693 - val_loss: 1.1869
Epoch 20/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4613 - val_loss: 1.1713
Epoch 21/200
264/264 [==============================] - 1s 6ms/step - loss: 0.4513 - val_loss: 1.1555
Epoch 22/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4406 - val_loss: 1.1405
Epoch 23/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4341 - val_loss: 1.1389
Epoch 24/200
264/264 [==============================] - 2s 6ms/step - loss: 0.4273 - val_loss: 1.1255
Epoch 25/200
264/264 [==============================] - 1s 6ms/step - loss: 0.4190 - val_loss: 1.1011
Epoch 26/200
264/264 [==============================] - 1s 5ms/step - loss: 0.4070 - val_loss: 1.0893
Epoch 27/200
264/264 [==============================] - 2s 6ms/step - loss: 0.3966 - val_loss: 1.0736
Epoch 28/200
264/264 [==============================] - 1s 5ms/step - loss: 0.3893 - val_loss: 1.0563
Epoch 29/200
264/264 [==============================] - 1s 5ms/step - loss: 0.3797 - val_loss: 1.0431
Epoch 30/200
264/264 [==============================] - 1s 6ms/step - loss: 0.3710 - val_loss: 1.0281
Epoch 31/200
264/264 [==============================] - 1s 5ms/step - loss: 0.3614 - val_loss: 1.0096
Epoch 32/200
264/264 [==============================] - 1s 5ms/step - loss: 0.3513 - val_loss: 0.9945
Epoch 33/200
264/264 [==============================] - 1s 5ms/step - loss: 0.3432 - val_loss: 0.9771
Epoch 34/200
264/264 [==============================] - 2s 6ms/step - loss: 0.3340 - val_loss: 0.9605
Epoch 35/200
264/264 [==============================] - 1s 6ms/step - loss: 0.3268 - val_loss: 0.9467
Epoch 36/200
264/264 [==============================] - 1s 6ms/step - loss: 0.3174 - val_loss: 0.9342
Epoch 37/200
264/264 [==============================] - 1s 6ms/step - loss: 0.3106 - val_loss: 0.9178
Epoch 38/200
264/264 [==============================] - 1s 6ms/step - loss: 0.3022 - val_loss: 0.9035
Epoch 39/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2951 - val_loss: 0.8903
Epoch 40/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2886 - val_loss: 0.8780
Epoch 41/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2827 - val_loss: 0.8666
Epoch 42/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2776 - val_loss: 0.8562
Epoch 43/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2729 - val_loss: 0.8460
Epoch 44/200
264/264 [==============================] - 2s 6ms/step - loss: 0.2684 - val_loss: 0.8362
Epoch 45/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2641 - val_loss: 0.8269
Epoch 46/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2600 - val_loss: 0.8175
Epoch 47/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2561 - val_loss: 0.8085
Epoch 48/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2521 - val_loss: 0.7994
Epoch 49/200
264/264 [==============================] - 2s 6ms/step - loss: 0.2483 - val_loss: 0.7906
Epoch 50/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2445 - val_loss: 0.7816
Epoch 51/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2411 - val_loss: 0.7730
Epoch 52/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2381 - val_loss: 0.7659
Epoch 53/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2351 - val_loss: 0.7571
Epoch 54/200
264/264 [==============================] - 2s 6ms/step - loss: 0.2312 - val_loss: 0.7480
Epoch 55/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2277 - val_loss: 0.7407
Epoch 56/200
264/264 [==============================] - 2s 6ms/step - loss: 0.2245 - val_loss: 0.7316
Epoch 57/200
264/264 [==============================] - 1s 6ms/step - loss: 0.2211 - val_loss: 0.7225
Epoch 58/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2178 - val_loss: 0.7144
Epoch 59/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2143 - val_loss: 0.7066
Epoch 60/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2123 - val_loss: 0.6985
Epoch 61/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2094 - val_loss: 0.6908
Epoch 62/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2056 - val_loss: 0.6835
Epoch 63/200
264/264 [==============================] - 1s 5ms/step - loss: 0.2022 - val_loss: 0.6750
Epoch 64/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1986 - val_loss: 0.6671
Epoch 65/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1957 - val_loss: 0.6597
Epoch 66/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1932 - val_loss: 0.6523
Epoch 67/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1910 - val_loss: 0.6444
Epoch 68/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1877 - val_loss: 0.6385
Epoch 69/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1856 - val_loss: 0.6321
Epoch 70/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1831 - val_loss: 0.6228
Epoch 71/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1787 - val_loss: 0.6150
Epoch 72/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1758 - val_loss: 0.6076
Epoch 73/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1737 - val_loss: 0.6008
Epoch 74/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1708 - val_loss: 0.5942
Epoch 75/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1683 - val_loss: 0.5876
Epoch 76/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1667 - val_loss: 0.5808
Epoch 77/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1649 - val_loss: 0.5745
Epoch 78/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1613 - val_loss: 0.5684
Epoch 79/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1590 - val_loss: 0.5604
Epoch 80/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1558 - val_loss: 0.5538
Epoch 81/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1535 - val_loss: 0.5473
Epoch 82/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1511 - val_loss: 0.5413
Epoch 83/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1495 - val_loss: 0.5360
Epoch 84/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1483 - val_loss: 0.5326
Epoch 85/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1455 - val_loss: 0.5226
Epoch 86/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1425 - val_loss: 0.5167
Epoch 87/200
264/264 [==============================] - 2s 7ms/step - loss: 0.1400 - val_loss: 0.5108
Epoch 88/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1379 - val_loss: 0.5049
Epoch 89/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1358 - val_loss: 0.4988
Epoch 90/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1339 - val_loss: 0.4936
Epoch 91/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1321 - val_loss: 0.4885
Epoch 92/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1303 - val_loss: 0.4847
Epoch 93/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1287 - val_loss: 0.4774
Epoch 94/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1259 - val_loss: 0.4708
Epoch 95/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1237 - val_loss: 0.4653
Epoch 96/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1217 - val_loss: 0.4599
Epoch 97/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1198 - val_loss: 0.4544
Epoch 98/200
264/264 [==============================] - 2s 7ms/step - loss: 0.1181 - val_loss: 0.4495
Epoch 99/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1163 - val_loss: 0.4448
Epoch 100/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1153 - val_loss: 0.4394
Epoch 101/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1133 - val_loss: 0.4342
Epoch 102/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1110 - val_loss: 0.4289
Epoch 103/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1098 - val_loss: 0.4247
Epoch 104/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1076 - val_loss: 0.4193
Epoch 105/200
264/264 [==============================] - 2s 6ms/step - loss: 0.1059 - val_loss: 0.4143
Epoch 106/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1042 - val_loss: 0.4098
Epoch 107/200
264/264 [==============================] - 1s 5ms/step - loss: 0.1027 - val_loss: 0.4054
Epoch 108/200
264/264 [==============================] - 1s 6ms/step - loss: 0.1012 - val_loss: 0.4012
Epoch 109/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0997 - val_loss: 0.3959
Epoch 110/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0982 - val_loss: 0.3916
Epoch 111/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0965 - val_loss: 0.3869
Epoch 112/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0949 - val_loss: 0.3826
Epoch 113/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0935 - val_loss: 0.3780
Epoch 114/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0919 - val_loss: 0.3739
Epoch 115/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0908 - val_loss: 0.3703
Epoch 116/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0894 - val_loss: 0.3657
Epoch 117/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0882 - val_loss: 0.3614
Epoch 118/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0862 - val_loss: 0.3571
Epoch 119/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0849 - val_loss: 0.3534
Epoch 120/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0836 - val_loss: 0.3494
Epoch 121/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0823 - val_loss: 0.3454
Epoch 122/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0812 - val_loss: 0.3421
Epoch 123/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0800 - val_loss: 0.3379
Epoch 124/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0786 - val_loss: 0.3340
Epoch 125/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0775 - val_loss: 0.3302
Epoch 126/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0760 - val_loss: 0.3268
Epoch 127/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0748 - val_loss: 0.3231
Epoch 128/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0736 - val_loss: 0.3194
Epoch 129/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0726 - val_loss: 0.3159
Epoch 130/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0715 - val_loss: 0.3126
Epoch 131/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0703 - val_loss: 0.3090
Epoch 132/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0691 - val_loss: 0.3056
Epoch 133/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0681 - val_loss: 0.3019
Epoch 134/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0670 - val_loss: 0.2991
Epoch 135/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0659 - val_loss: 0.2950
Epoch 136/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0650 - val_loss: 0.2924
Epoch 137/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0638 - val_loss: 0.2891
Epoch 138/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0628 - val_loss: 0.2859
Epoch 139/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0619 - val_loss: 0.2831
Epoch 140/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0609 - val_loss: 0.2802
Epoch 141/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0599 - val_loss: 0.2771
Epoch 142/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0589 - val_loss: 0.2743
Epoch 143/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0581 - val_loss: 0.2712
Epoch 144/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0571 - val_loss: 0.2682
Epoch 145/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0562 - val_loss: 0.2653
Epoch 146/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0553 - val_loss: 0.2631
Epoch 147/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0545 - val_loss: 0.2597
Epoch 148/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0535 - val_loss: 0.2570
Epoch 149/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0527 - val_loss: 0.2542
Epoch 150/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0518 - val_loss: 0.2517
Epoch 151/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0511 - val_loss: 0.2491
Epoch 152/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0502 - val_loss: 0.2465
Epoch 153/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0495 - val_loss: 0.2439
Epoch 154/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0487 - val_loss: 0.2413
Epoch 155/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0478 - val_loss: 0.2386
Epoch 156/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0471 - val_loss: 0.2362
Epoch 157/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0464 - val_loss: 0.2339
Epoch 158/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0456 - val_loss: 0.2316
Epoch 159/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0449 - val_loss: 0.2291
Epoch 160/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0442 - val_loss: 0.2267
Epoch 161/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0435 - val_loss: 0.2244
Epoch 162/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0428 - val_loss: 0.2221
Epoch 163/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0421 - val_loss: 0.2197
Epoch 164/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0414 - val_loss: 0.2176
Epoch 165/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0408 - val_loss: 0.2155
Epoch 166/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0401 - val_loss: 0.2135
Epoch 167/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0395 - val_loss: 0.2114
Epoch 168/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0388 - val_loss: 0.2091
Epoch 169/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0382 - val_loss: 0.2070
Epoch 170/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0376 - val_loss: 0.2052
Epoch 171/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0370 - val_loss: 0.2030
Epoch 172/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0364 - val_loss: 0.2011
Epoch 173/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0358 - val_loss: 0.1991
Epoch 174/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0353 - val_loss: 0.1971
Epoch 175/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0347 - val_loss: 0.1953
Epoch 176/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0342 - val_loss: 0.1935
Epoch 177/200
264/264 [==============================] - 1s 5ms/step - loss: 0.0336 - val_loss: 0.1917
Epoch 178/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0331 - val_loss: 0.1898
Epoch 179/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0325 - val_loss: 0.1880
Epoch 180/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0320 - val_loss: 0.1864
Epoch 181/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0315 - val_loss: 0.1844
Epoch 182/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0310 - val_loss: 0.1829
Epoch 183/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0305 - val_loss: 0.1811
Epoch 184/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0300 - val_loss: 0.1796
Epoch 185/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0295 - val_loss: 0.1777
Epoch 186/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0291 - val_loss: 0.1761
Epoch 187/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0286 - val_loss: 0.1746
Epoch 188/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0281 - val_loss: 0.1729
Epoch 189/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0277 - val_loss: 0.1715
Epoch 190/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0273 - val_loss: 0.1697
Epoch 191/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0268 - val_loss: 0.1682
Epoch 192/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0264 - val_loss: 0.1667
Epoch 193/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0260 - val_loss: 0.1651
Epoch 194/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0256 - val_loss: 0.1638
Epoch 195/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0252 - val_loss: 0.1623
Epoch 196/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0247 - val_loss: 0.1609
Epoch 197/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0244 - val_loss: 0.1596
Epoch 198/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0240 - val_loss: 0.1582
Epoch 199/200
264/264 [==============================] - 1s 6ms/step - loss: 0.0236 - val_loss: 0.1568
Epoch 200/200
264/264 [==============================] - 2s 6ms/step - loss: 0.0232 - val_loss: 0.1556

In [ ]:

Copied!

%load_ext tensorboard
%tensorboard --logdir logs
%load_ext tensorboard
%tensorboard --logdir logs

Output hidden; open in https://colab.research.google.com to view.

Fast K nearest neighbors¶

scikit learn's KNN does not scale well; use Facebook's faiss library instead

In [ ]:

Copied!





# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev 
# => doing so allows for "pip install faiss"
# => which then also allows for "import utils_knn"
# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev 
# => doing so allows for "pip install faiss"
# => which then also allows for "import utils_knn"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 1s (419 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package libomp5:amd64.
(Reading database ... 155455 files and directories currently installed.)
Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ...
Unpacking libomp5:amd64 (5.0.1-1) ...
Selecting previously unselected package libomp-dev.
Preparing to unpack .../libomp-dev_5.0.1-1_amd64.deb ...
Unpacking libomp-dev (5.0.1-1) ...
Setting up libomp5:amd64 (5.0.1-1) ...
Setting up libomp-dev (5.0.1-1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.3) ...
/sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link

In [ ]:

Copied!





# import utils_sys as usys
try: 
    import faiss
except: 
    # pip install faiss
    usys.install('faiss')
    import faiss

import knn_models
# import utils_sys as usys
try: 
    import faiss
except: 
    # pip install faiss
    usys.install('faiss')
    import faiss

import knn_models

In [ ]:

Copied!





from utilities import normalize
import scipy.sparse as sparse
# from sklearn.preprocessing import normalize

class FaissKNN:
    def __init__(self, k=5, normalize=False):
        self.index = None
        self.y = None
        self.y_tag = None # other meta data for the label/target such as polarities, colors
        self.k = k
        self.normalize_input = normalize

    def fit(self, X, y):
        self.index = faiss.IndexFlatL2(X.shape[1]) # Each x in X is in row-vector format i.e. X has shape  (n_instances, n_dim)
        # Note: Rating matrix (X), however, is in column-vector format; therefore, we need to remember to take transpose before using it as an input
  
        if self.normalize_input: 
            X = normalize(X, axis=1) # X is in row-vector format

        self.index.add(X.astype(np.float32))
        self.y = y

    def predict(self, X):
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        # shape(distances): (n_instances, k)
        # shape(indices):   (n_instances, k)

        votes = self.y[indices] # note: shape(votes)=shape(indices)
        predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
        # np.bincount([1, 1, 1, 0, 1, 0, 0, 0, 1, 1]) 
        # ~> array([4, 6]) because index 0 occurs 4 times, and 1 occurs 6 times
        return predictions
    def search(self, X): 
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        return distances, indices
        
from utilities import normalize
import scipy.sparse as sparse
# from sklearn.preprocessing import normalize

class FaissKNN:
    def __init__(self, k=5, normalize=False):
        self.index = None
        self.y = None
        self.y_tag = None # other meta data for the label/target such as polarities, colors
        self.k = k
        self.normalize_input = normalize

    def fit(self, X, y):
        self.index = faiss.IndexFlatL2(X.shape[1]) # Each x in X is in row-vector format i.e. X has shape  (n_instances, n_dim)
        # Note: Rating matrix (X), however, is in column-vector format; therefore, we need to remember to take transpose before using it as an input
  
        if self.normalize_input: 
            X = normalize(X, axis=1) # X is in row-vector format

        self.index.add(X.astype(np.float32))
        self.y = y

    def predict(self, X):
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        # shape(distances): (n_instances, k)
        # shape(indices):   (n_instances, k)

        votes = self.y[indices] # note: shape(votes)=shape(indices)
        predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
        # np.bincount([1, 1, 1, 0, 1, 0, 0, 0, 1, 1]) 
        # ~> array([4, 6]) because index 0 occurs 4 times, and 1 occurs 6 times
        return predictions
    def search(self, X): 
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        return distances, indices
        

In [ ]:

Copied!





from numpy import linalg as LA
from analyzer import is_sparse
from sklearn.preprocessing import normalize
import data_pipeline as dp
import utils_knn as uknn
from collections import namedtuple
from sklearn.metrics import f1_score
# import polarity_models as pmodel
# from polarity_models import Polarity

def predict_by_knn(model, model_knn, R, T, L_train, L_test, C, Pc, codes={}, pos_label=1, verbose=1): 
    """
    
    Parameters 
    ----------
    model: An instance of CFNet that has been pre-trained 
    model_knn: An instance of Faiss KNN model that has been pre-trained

    R:  probability/rating matrix of the training split
    T: 
    L_train:
    L_test: 
    C: 
    Pc: color matrix of the training split  

    """ 
    if verbose: np.set_printoptions(precision=3, edgeitems=5, suppress=True)

    # Convert rating matrices back to typical ML training set format
    X_train = R.T
    X_test = T.T

    # Find kNNs for each test instances in T
    distances, knn_indices = model_knn.search(X_test)

    N, k = knn_indices.shape # `knn_indices` is a k-by-N matrix, where k as in kNN and N is the sample size 
    n_users = T.shape[0]

    assert N == T.shape[1], f"Size of test set: {T.shape[1]} inconsistent what's inferred from knn indices: {N}"
    assert R.shape == Pc.shape

    if len(codes) == 0: codes = Polarity.codes

    if is_sparse(Pc): Pc = Pc.A #

    # Infer true labels (L_train) from color matrix
    L_train = pmodel.color_matrix_to_labels(Pc, codes=codes) # True labels for R
    n_unreliable_knn_cases = 0
    col_user, col_item, col_value = 'user', 'item', 'rating'

    Th = np.zeros_like(T, dtype='float32') # Initialize the re-estimated test set (Th) for T
    T_knn_best = np.zeros_like(T, dtype='float32')
    T_avg = np.zeros_like(T, dtype='float32')
    T_masked_avg = np.zeros_like(T, dtype='float32')
    Th_reliable = np.zeros_like(T, dtype='float32') # unreliable entries are marked by special number (e.g. 0)

    T_pred = {} # keep track of various predictied outputs according to different strategies
    T_pred['knn_max'] = []

    # kNN top of the top (rank kNNs further by their entropy values, the smaller the better)
    # L_knn, top_indices = uknn.estimate_labels_by_rank(model_knn, T, Pc, topn=min(3, k), 
    #                                                    rank_fn=uknn.compute_entropy, 
    #                                                    larger_is_better=False, 
    #                                                    verbose=0)
    msg = ''
    test_points = np.random.choice(range(N), 10)
    for i in tqdm(range(N)):  # foreach position in the test split (T)
        knn_idx = knn_indices[i] # test point (i)'s k nearest neighbors in R (in terms of their indices)
        # knn_idx = top_indices[i]

        Pc_i = Pc[:, knn_idx].astype(int) # subset the color matrix at kNN indices

        # Method #1 Majority vote: Use the label determined by majority vote within kNNs
        L_knn_i = pmodel.color_matrix_to_labels(Pc_i, codes=codes) # kNN's labels
        ti_knn_max = np.argmax( np.bincount(L_knn_i) ) # kNN-predicted label by majority vote
        # ti_knn_max = L_knn[i]
        T_pred['knn_max'].append(ti_knn_max)

        # Gather statistics
        ni = Pc_i.size # ~ T.size
        ntp = np.sum(Pc_i == codes['tp'])
        ntn = np.sum(Pc_i == codes['tn'])
        nfp = np.sum(Pc_i == codes['fp'])
        nfn = np.sum(Pc_i == codes['fn'])

        if (ntp+ntn)==0: # None of the base classifiers (users) made any correct predictions within these kNNs
            n_unreliable_knn_cases += 1

        # [Test]
        if verbose > 1: 
            msg += f"[info] test point index: {i}\n" + '#' * 50 + '\n'
            msg += f"> T({i}):\n{T[:, i]}\n"
            msg += f"> R({i}):\n{R[:, knn_idx[0]]}\n" # point in R closest to the current test point T[:, i]
            msg += f"> Pc_i(shape={Pc_i.shape}):\n{Pc_i}\n"
            msg += f"> L_knn(size={len(L_knn)}):\n{L_knn}\n"
            msg += f"> label prediction (knn) => {ti_knn_max}\n"

        # Method #2 Best uses: foreach base classifier prediction in ti, use the "best" among these kNNs (majority vote followed by restiamte)
        max_colors, max_indices = [], []
        for u in range(n_users): 
            color, pos = uknn.most_common_element_and_position(Pc_i[u, :], pos_key_only=True)
            max_colors.append(color)
            max_indices.append(knn_idx[pos]) # we also want the knn index
        X_knn_best = dp.zip_user_item_pairs(T, item_ids=max_indices)
        y_knn_best = model.predict(X_knn_best)
        T_knn_best[:, i] = np.squeeze(y_knn_best, axis=-1)
        
        # Compute the mask within these kNN part of the training data
        M = np.zeros_like(Pc_i) # np.repeat(Li, Pc_i.size).reshape(Pc_i.shape)
        M[Pc_i > 0] = 1 # polarity > 0 => correct predictions (either TP or TN) => keep their re-estimated values by setting these entries to 1s
        
        # ... polarity < 0 => incorrect predictions => discard by setting them to 0s

        # Get re-estimated values for the kNN (of test instance)
        X_knn = dp.make_user_item_pairs(T, item_ids=knn_idx) # structure k-NN in user-item-pair format for CFNet-based models
        assert X_knn.shape[0] == Pc_i.size
        y_knn = model.predict(X_knn)
        T_knn = y_knn.reshape((n_users, len(knn_idx))) # use len(knn_idx) instead of `k` to consider the flexibility of selecting even fewer candidates
        
        # if i == 10: print(f"[test] knn_idx: {knn_idx}"); print(f"[test] X_knn:\n{X_knn}\n"); print(f"[test] T_knn:\n{T_knn}\n") 
        assert T_knn.shape[1] <= k, f"T_knn[1] == k(NN): {k} but got {T_knn.shape[1]}"
        assert T_knn.shape == Pc_i.shape, f"T_knn is a n_users-by-k matrix but got shape: {T_knn.shape}"
        
        # Method #3 Column Average: Use the average across the re-estimated kNNs
        ti_knn_avg = np.mean(T_knn, axis=1) # take column-wise average (i.e. for each user, take the average among kNNs)
        T_avg[:, i] = ti_knn_avg

        # Method #4 Masked Average: Use the reestimated values w.r.t ONLY those with positive polarity (i.e. averaging from TPs or TNs)
        eps = 1e-4
        ti_knn_masked_avg = (M*T_knn).sum(1)/(M.sum(1)+eps) # take average from non-zero entries only
        T_masked_avg[:, i] = ti_knn_masked_avg

        # Method #5 Adjusted Masked Average: Consider degenerative cases in which, for a given base classifier, 
        #           NONE of its predictions in these kNNs are correct
        #           - It's possible that some classifiers never made correct predictions in the context of these kNNs
        #             - For these rows, their values in M are all zeros
        #             `- Set a default value if that's the case (e.g. average)
        Th[:, i] = np.where(ti_knn_masked_avg == 0, ti_knn_avg, ti_knn_masked_avg)
        
        # Method #6: Mark unreliable entries by -1 (and apply a post-hoc method to Th); post-hoc method is yet to be defined
        Th_reliable[:, i] =  np.where(ti_knn_masked_avg == 0, -1, ti_knn_masked_avg)

    T_pred['T_knn_best'] = T_knn_best # best users
    T_pred['T_avg'] = T_avg # average
    T_pred['T_masked_avg'] = T_masked_avg # masked average
    T_pred['Th'] = Th # adjusted masked average
    T_pred['Th_reliable'] = Th_reliable # -1

    if verbose: 
        print(f"[info] Number of unreliable kNN cases: {n_unreliable_knn_cases}") 
   
    return T_pred
        
# Normalize each data point so that they have a unit length
# R = normalize(R, axis=0, norm='l2')
# T = normalize(T, axis=0, norm='l2')
# test_points = np.random.choice(range(T.shape[1]), 10)
# for t in test_points: 
#     # print(f"norm({t})={LA.norm(T[:, t], 2)}") 
#     assert np.allclose(1.0, LA.norm(T[:, t], 2))

X_train = R.T
X_test = T.T

fknn = FaissKNN(k=10)
fknn.fit(X_train, L_train) # Note: X_train = np.tranpose(R)

assert Pc.shape == R.shape
assert Cw.shape == R.shape
assert len(L_train) == R.shape[1]
T_pred = predict_by_knn(cf_model, fknn, 
                        R, T, L_train, L_test, Cw, Pc, 
                        codes=Polarity.codes, pos_label=1, verbose=1)

# A CF ensemble dataset consists of several parts: original (rating) matrix, re-estimated matrix, ...
# - namedtuple comes in handy
DataSet = namedtuple("DataSet", "X, Xh, L") # declare a `DataSet` type with the attributes: X, Xh and L
Hyperparams = namedtuple("Hyperparams", "alpha, n_factors, policy_threshold, conf_measure")

# The objects associated with traing split, hyperparameters are invariant across different prediction strategies
####################################################
Rh, _ = cm.reestimate(cf_model, R) # We still use cf_model alone to reestimate Rh (no kNN involved)
meta = Hyperparams(policy_threshold=policy_threshold,
                   conf_measure=conf_measure, 
                   alpha=alpha, n_factors=n_factors)
train_split = DataSet(R, Rh, L_train)
####################################################

test_split = DataSet(T, T_pred['T_avg'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Average: knn-reestimate the entire T with learned latent factors")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['T_masked_avg'], L_test)
highlight(f"(kNN) Masked Average: kNN-reestimate T using ONLY reliable entries")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['Th'], L_test)
highlight(f"(kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation'")
# lh, lh_new, p_threshold, p_threshold_new = \
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['T_knn_best'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

highlight(f"(kNN) Prediction via majority vote within kNNs (not recommended)")
perf_score = f1_score(test_split.L, T_pred['knn_max'])
print(f'[result] F1 score:  {perf_score}')
from numpy import linalg as LA
from analyzer import is_sparse
from sklearn.preprocessing import normalize
import data_pipeline as dp
import utils_knn as uknn
from collections import namedtuple
from sklearn.metrics import f1_score
# import polarity_models as pmodel
# from polarity_models import Polarity

def predict_by_knn(model, model_knn, R, T, L_train, L_test, C, Pc, codes={}, pos_label=1, verbose=1): 
    """
    
    Parameters 
    ----------
    model: An instance of CFNet that has been pre-trained 
    model_knn: An instance of Faiss KNN model that has been pre-trained

    R:  probability/rating matrix of the training split
    T: 
    L_train:
    L_test: 
    C: 
    Pc: color matrix of the training split  

    """ 
    if verbose: np.set_printoptions(precision=3, edgeitems=5, suppress=True)

    # Convert rating matrices back to typical ML training set format
    X_train = R.T
    X_test = T.T

    # Find kNNs for each test instances in T
    distances, knn_indices = model_knn.search(X_test)

    N, k = knn_indices.shape # `knn_indices` is a k-by-N matrix, where k as in kNN and N is the sample size 
    n_users = T.shape[0]

    assert N == T.shape[1], f"Size of test set: {T.shape[1]} inconsistent what's inferred from knn indices: {N}"
    assert R.shape == Pc.shape

    if len(codes) == 0: codes = Polarity.codes

    if is_sparse(Pc): Pc = Pc.A #

    # Infer true labels (L_train) from color matrix
    L_train = pmodel.color_matrix_to_labels(Pc, codes=codes) # True labels for R
    n_unreliable_knn_cases = 0
    col_user, col_item, col_value = 'user', 'item', 'rating'

    Th = np.zeros_like(T, dtype='float32') # Initialize the re-estimated test set (Th) for T
    T_knn_best = np.zeros_like(T, dtype='float32')
    T_avg = np.zeros_like(T, dtype='float32')
    T_masked_avg = np.zeros_like(T, dtype='float32')
    Th_reliable = np.zeros_like(T, dtype='float32') # unreliable entries are marked by special number (e.g. 0)

    T_pred = {} # keep track of various predictied outputs according to different strategies
    T_pred['knn_max'] = []

    # kNN top of the top (rank kNNs further by their entropy values, the smaller the better)
    # L_knn, top_indices = uknn.estimate_labels_by_rank(model_knn, T, Pc, topn=min(3, k), 
    #                                                    rank_fn=uknn.compute_entropy, 
    #                                                    larger_is_better=False, 
    #                                                    verbose=0)
    msg = ''
    test_points = np.random.choice(range(N), 10)
    for i in tqdm(range(N)):  # foreach position in the test split (T)
        knn_idx = knn_indices[i] # test point (i)'s k nearest neighbors in R (in terms of their indices)
        # knn_idx = top_indices[i]

        Pc_i = Pc[:, knn_idx].astype(int) # subset the color matrix at kNN indices

        # Method #1 Majority vote: Use the label determined by majority vote within kNNs
        L_knn_i = pmodel.color_matrix_to_labels(Pc_i, codes=codes) # kNN's labels
        ti_knn_max = np.argmax( np.bincount(L_knn_i) ) # kNN-predicted label by majority vote
        # ti_knn_max = L_knn[i]
        T_pred['knn_max'].append(ti_knn_max)

        # Gather statistics
        ni = Pc_i.size # ~ T.size
        ntp = np.sum(Pc_i == codes['tp'])
        ntn = np.sum(Pc_i == codes['tn'])
        nfp = np.sum(Pc_i == codes['fp'])
        nfn = np.sum(Pc_i == codes['fn'])

        if (ntp+ntn)==0: # None of the base classifiers (users) made any correct predictions within these kNNs
            n_unreliable_knn_cases += 1

        # [Test]
        if verbose > 1: 
            msg += f"[info] test point index: {i}\n" + '#' * 50 + '\n'
            msg += f"> T({i}):\n{T[:, i]}\n"
            msg += f"> R({i}):\n{R[:, knn_idx[0]]}\n" # point in R closest to the current test point T[:, i]
            msg += f"> Pc_i(shape={Pc_i.shape}):\n{Pc_i}\n"
            msg += f"> L_knn(size={len(L_knn)}):\n{L_knn}\n"
            msg += f"> label prediction (knn) => {ti_knn_max}\n"

        # Method #2 Best uses: foreach base classifier prediction in ti, use the "best" among these kNNs (majority vote followed by restiamte)
        max_colors, max_indices = [], []
        for u in range(n_users): 
            color, pos = uknn.most_common_element_and_position(Pc_i[u, :], pos_key_only=True)
            max_colors.append(color)
            max_indices.append(knn_idx[pos]) # we also want the knn index
        X_knn_best = dp.zip_user_item_pairs(T, item_ids=max_indices)
        y_knn_best = model.predict(X_knn_best)
        T_knn_best[:, i] = np.squeeze(y_knn_best, axis=-1)
        
        # Compute the mask within these kNN part of the training data
        M = np.zeros_like(Pc_i) # np.repeat(Li, Pc_i.size).reshape(Pc_i.shape)
        M[Pc_i > 0] = 1 # polarity > 0 => correct predictions (either TP or TN) => keep their re-estimated values by setting these entries to 1s
        
        # ... polarity < 0 => incorrect predictions => discard by setting them to 0s

        # Get re-estimated values for the kNN (of test instance)
        X_knn = dp.make_user_item_pairs(T, item_ids=knn_idx) # structure k-NN in user-item-pair format for CFNet-based models
        assert X_knn.shape[0] == Pc_i.size
        y_knn = model.predict(X_knn)
        T_knn = y_knn.reshape((n_users, len(knn_idx))) # use len(knn_idx) instead of `k` to consider the flexibility of selecting even fewer candidates
        
        # if i == 10: print(f"[test] knn_idx: {knn_idx}"); print(f"[test] X_knn:\n{X_knn}\n"); print(f"[test] T_knn:\n{T_knn}\n") 
        assert T_knn.shape[1] <= k, f"T_knn[1] == k(NN): {k} but got {T_knn.shape[1]}"
        assert T_knn.shape == Pc_i.shape, f"T_knn is a n_users-by-k matrix but got shape: {T_knn.shape}"
        
        # Method #3 Column Average: Use the average across the re-estimated kNNs
        ti_knn_avg = np.mean(T_knn, axis=1) # take column-wise average (i.e. for each user, take the average among kNNs)
        T_avg[:, i] = ti_knn_avg

        # Method #4 Masked Average: Use the reestimated values w.r.t ONLY those with positive polarity (i.e. averaging from TPs or TNs)
        eps = 1e-4
        ti_knn_masked_avg = (M*T_knn).sum(1)/(M.sum(1)+eps) # take average from non-zero entries only
        T_masked_avg[:, i] = ti_knn_masked_avg

        # Method #5 Adjusted Masked Average: Consider degenerative cases in which, for a given base classifier, 
        #           NONE of its predictions in these kNNs are correct
        #           - It's possible that some classifiers never made correct predictions in the context of these kNNs
        #             - For these rows, their values in M are all zeros
        #             `- Set a default value if that's the case (e.g. average)
        Th[:, i] = np.where(ti_knn_masked_avg == 0, ti_knn_avg, ti_knn_masked_avg)
        
        # Method #6: Mark unreliable entries by -1 (and apply a post-hoc method to Th); post-hoc method is yet to be defined
        Th_reliable[:, i] =  np.where(ti_knn_masked_avg == 0, -1, ti_knn_masked_avg)

    T_pred['T_knn_best'] = T_knn_best # best users
    T_pred['T_avg'] = T_avg # average
    T_pred['T_masked_avg'] = T_masked_avg # masked average
    T_pred['Th'] = Th # adjusted masked average
    T_pred['Th_reliable'] = Th_reliable # -1

    if verbose: 
        print(f"[info] Number of unreliable kNN cases: {n_unreliable_knn_cases}") 
   
    return T_pred
        
# Normalize each data point so that they have a unit length
# R = normalize(R, axis=0, norm='l2')
# T = normalize(T, axis=0, norm='l2')
# test_points = np.random.choice(range(T.shape[1]), 10)
# for t in test_points: 
#     # print(f"norm({t})={LA.norm(T[:, t], 2)}") 
#     assert np.allclose(1.0, LA.norm(T[:, t], 2))

X_train = R.T
X_test = T.T

fknn = FaissKNN(k=10)
fknn.fit(X_train, L_train) # Note: X_train = np.tranpose(R)

assert Pc.shape == R.shape
assert Cw.shape == R.shape
assert len(L_train) == R.shape[1]
T_pred = predict_by_knn(cf_model, fknn, 
                        R, T, L_train, L_test, Cw, Pc, 
                        codes=Polarity.codes, pos_label=1, verbose=1)

# A CF ensemble dataset consists of several parts: original (rating) matrix, re-estimated matrix, ...
# - namedtuple comes in handy
DataSet = namedtuple("DataSet", "X, Xh, L") # declare a `DataSet` type with the attributes: X, Xh and L
Hyperparams = namedtuple("Hyperparams", "alpha, n_factors, policy_threshold, conf_measure")

# The objects associated with traing split, hyperparameters are invariant across different prediction strategies
####################################################
Rh, _ = cm.reestimate(cf_model, R) # We still use cf_model alone to reestimate Rh (no kNN involved)
meta = Hyperparams(policy_threshold=policy_threshold,
                   conf_measure=conf_measure, 
                   alpha=alpha, n_factors=n_factors)
train_split = DataSet(R, Rh, L_train)
####################################################

test_split = DataSet(T, T_pred['T_avg'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Average: knn-reestimate the entire T with learned latent factors")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['T_masked_avg'], L_test)
highlight(f"(kNN) Masked Average: kNN-reestimate T using ONLY reliable entries")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['Th'], L_test)
highlight(f"(kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation'")
# lh, lh_new, p_threshold, p_threshold_new = \
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

test_split = DataSet(T, T_pred['T_knn_best'], L_test) # Seems to have an advantage over the other strategies
highlight(f"(kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs")
cm.analyze_reestimated_matrices(train_split, test_split, meta=meta, include_stacking=True)

highlight(f"(kNN) Prediction via majority vote within kNNs (not recommended)")
perf_score = f1_score(test_split.L, T_pred['knn_max'])
print(f'[result] F1 score:  {perf_score}')

100%|██████████| 1250/1250 [02:31<00:00,  8.24it/s]

[info] Number of unreliable kNN cases: 8

================================================================================
(kNN) Average: knn-reestimate the entire T with learned latent factors
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 76.3069947781743
[info] From T to Th, delta(Frobenius norm)= 40.07294361077488
[info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626
...    Original p_threshold:
[0.499 0.    0.008 0.    0.072]

...    New p_threshold:
[1.    1.    0.957 0.696 0.707]

[info] How different are lh and lh_new? 0.46
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984
[result] Majority vote: F1 score with re-estimated Th: 0.16216216216216214

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.18666666666666668

[result] Best settings: lh_maxvote, score: 0.20470262793914248

================================================================================
(kNN) Masked Average: kNN-reestimate T using ONLY reliable entries
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 76.3069947781743
[info] From T to Th, delta(Frobenius norm)= 42.30971377472983
[info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626
...    Original p_threshold:
[0.499 0.    0.008 0.    0.072]

...    New p_threshold:
[1.    1.    0.957 0.696 0.707]

[info] How different are lh and lh_new? 0.4384
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.21865596790371114
[result] Majority vote: F1 score with re-estimated Th: 0.2057142857142857

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.2

[result] Best settings: lh2_maxvote_pth_unadjusted, score: 0.21865596790371114

================================================================================
(kNN) Adjusted Masked Average: kNN-reestimate T via 'interpolation'
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 76.3069947781743
[info] From T to Th, delta(Frobenius norm)= 46.177436660954605
[info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626
...    Original p_threshold:
[0.499 0.    0.008 0.    0.072]

...    New p_threshold:
[1.    1.    0.957 0.696 0.707]

[info] How different are lh and lh_new? 0.4384
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984
[result] Majority vote: F1 score with re-estimated Th: 0.2057142857142857

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.2

[result] Best settings: lh2_maxvote_pth_adjusted, score: 0.2057142857142857

================================================================================
(kNN) Best Users: knn-reestimate T with learned latent factors BUT choose the best classifier predictions among these kNNs
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 76.3069947781743
[info] From T to Th, delta(Frobenius norm)= 46.48002205715579
[info] From `p_threshold(R)` to `p_threshold(Rh)`, delta(2-norm)= 1.7429831143509626
...    Original p_threshold:
[0.499 0.    0.008 0.    0.072]

...    New p_threshold:
[1.    1.    0.957 0.696 0.707]

[info] How different are lh and lh_new? 0.4568
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984
[result] Majority vote: F1 score with re-estimated Th: 0.2271762208067941

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.19889502762430938

[result] Best settings: lh2_maxvote_pth_adjusted, score: 0.2271762208067941

================================================================================
(kNN) Prediction via majority vote within kNNs (not recommended)
================================================================================
[result] F1 score:  0.18666666666666668

Error Analysis¶

Let's observe the color patterns associated with the positive exmaples (the minority class)

In [ ]:

Copied!





from utils_knn import estimate_labels_by_rank, compute_entropy

assert Pc.shape == R.shape
assert T.shape[0] == Pc.shape[0]

topn = 3
# fknn = FaissKNN(k=10)
# fknn.fit(X_train, L_train)
lh, top_indices = estimate_labels_by_rank(fknn, T, Pc, topn=topn, rank_fn=compute_entropy, 
                    larger_is_better=False, 
                    verbose=2)

assert np.array(top_indices).shape[0] == T.shape[1]
assert np.array(top_indices).shape[1] == topn
assert len(lh) == T.shape[1]
from utils_knn import estimate_labels_by_rank, compute_entropy

assert Pc.shape == R.shape
assert T.shape[0] == Pc.shape[0]

topn = 3
# fknn = FaissKNN(k=10)
# fknn.fit(X_train, L_train)
lh, top_indices = estimate_labels_by_rank(fknn, T, Pc, topn=topn, rank_fn=compute_entropy, 
                    larger_is_better=False, 
                    verbose=2)

assert np.array(top_indices).shape[0] == T.shape[1]
assert np.array(top_indices).shape[1] == topn
assert len(lh) == T.shape[1]

[info] Pc_592:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1  1  1  1  1 -1  1  1  1]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]]

[info] sorted_knn_i (n=3):
[(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)]

[info] top_knn_i:
[3114, 1658, 640]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_592 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [ 1  1  1]
 [-2 -2 -2]]

[info] Pc_650:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [-1  1  1  1  1  1  1  1  1  1]]

[info] sorted_knn_i (n=3):
[(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)]

[info] top_knn_i:
[3458, 412, 617]

[info] L_knn(n=3): [1 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_650 local:
[[ 2 -2 -2]
 [ 2 -2 -2]
 [ 2 -2 -2]
 [-1  1  1]
 [-1  1  1]]

[info] Pc_720:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -2  1  1  1  1  1  1]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]

[info] sorted_knn_i (n=3):
[(0.0, 3), (0.7219280948873623, 0), (0.7219280948873623, 1)]

[info] top_knn_i:
[3570, 3101, 1657]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [3, 0, 1]
..... Pc_720 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [-2  1  1]
 [-2 -2 -2]]

[info] Pc_729:
[[-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  1  1  1 -2 -2  1]]

[info] sorted_knn_i (n=3):
[(0.0, 7), (0.0, 8), (0.7219280948873623, 0)]

[info] top_knn_i:
[807, 548, 775]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [7, 8, 0]
..... Pc_729 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [-2 -2  1]]

[info] Pc_756:
[[-2  2 -2 -2  2 -2 -2 -2  2 -2]
 [-2  2 -2 -2  2 -2 -2 -2  2 -2]
 [-2  2 -2 -2  2 -2 -2 -2  2 -2]
 [ 1 -1  1  1 -1  1  1  1 -1 -2]
 [-2  2 -2 -2  2 -2 -2 -2  2 -2]]

[info] sorted_knn_i (n=3):
[(0.0, 9), (0.7219280948873623, 0), (0.7219280948873623, 1)]

[info] top_knn_i:
[2520, 2721, 1242]

[info] L_knn(n=3): [0 0 1]
..... top_knn_ij: [9, 0, 1]
..... Pc_756 local:
[[-2 -2  2]
 [-2 -2  2]
 [-2 -2  2]
 [-2  1 -1]
 [-2 -2  2]]

[info] Pc_855:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1  1  1  1  1 -1  1  1  1]]

[info] sorted_knn_i (n=3):
[(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)]

[info] top_knn_i:
[1622, 839, 789]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_855 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [ 1  1  1]]

[info] Pc_915:
[[-2  2 -2 -2  2  2 -2 -2 -2 -2]
 [-2  2 -2 -2  2  2 -2 -2 -2 -2]
 [-2  2 -2 -2  2  2 -2 -2 -2 -2]
 [-2  2 -2 -2  2  2 -2 -2 -2 -2]
 [ 1 -1  1  1 -1  2 -2  1 -2  1]]

[info] sorted_knn_i (n=3):
[(0.0, 5), (0.0, 6), (0.0, 8)]

[info] top_knn_i:
[3231, 1405, 146]

[info] L_knn(n=3): [1 0 0]
..... top_knn_ij: [5, 6, 8]
..... Pc_915 local:
[[ 2 -2 -2]
 [ 2 -2 -2]
 [ 2 -2 -2]
 [ 2 -2 -2]
 [ 2 -2 -2]]

[info] Pc_1048:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]

[info] sorted_knn_i (n=3):
[(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)]

[info] top_knn_i:
[2485, 372, 3190]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_1048 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [ 1  1  1]
 [ 1  1  1]]

[info] Pc_1134:
[[-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  1 -2  1 -2  1  1]
 [-2 -2 -2  2  1  1 -2  1 -2  1]]

[info] sorted_knn_i (n=3):
[(0.7219280948873623, 0), (0.7219280948873623, 1), (0.7219280948873623, 2)]

[info] top_knn_i:
[1344, 1504, 1804]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_1134 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [ 1  1  1]
 [-2 -2 -2]]

[info] Pc_1227:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1  1  1  1  1  1  1 -1  1]
 [ 1  1  1  1  1  1  1  1 -1  1]]

[info] sorted_knn_i (n=3):
[(0.9709505944546688, 0), (0.9709505944546688, 1), (0.9709505944546688, 2)]

[info] top_knn_i:
[2811, 2698, 2029]

[info] L_knn(n=3): [0 0 0]
..... top_knn_ij: [0, 1, 2]
..... Pc_1227 local:
[[-2 -2 -2]
 [-2 -2 -2]
 [-2 -2 -2]
 [ 1  1  1]
 [ 1  1  1]]

In [ ]:

Copied!

import utils_knn as uknn

X_test = test_split.X.T
L_test = test_split.L
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=1)
import utils_knn as uknn

X_test = test_split.X.T
L_test = test_split.L
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=1)

> Positive example #1
> Pc_0:
[[ 2  2  2  2 -2 -2  2  2 -2 -2]
 [ 2  2  2  2 -2 -2  2  2 -2 -2]
 [ 2  2  2  2 -2 -2  2  2 -2 -2]
 [ 2  2  2  2 -2 -2  2  2 -2 -2]
 [ 2  2  2  2 -2 -2  2  2 -2 -2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [1793, 1793, 1793, 1793, 1793]
--------------------------------------------------
> Positive example #2
> Pc_1:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -2  1  1  1 -2  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [2422, 2422, 2422, 2422, 2422]
--------------------------------------------------
> Positive example #3
> Pc_2:
[[-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [ 1  1  1 -1  1  1  1 -1  1  1]
 [ 1  1  1 -1  1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3238, 3238, 3238, 1705, 1705]
--------------------------------------------------
> Positive example #4
> Pc_3:
[[-2 -2 -2 -2  2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2  2 -2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [743, 743, 743, 743, 743]
--------------------------------------------------
> Positive example #5
> Pc_4:
[[-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  2  1  1  1  1  1 -1  1  1]
 [ 1 -1  1  1  1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3032, 3032, 3032, 432, 432]
--------------------------------------------------
> Positive example #6
> Pc_5:
[[-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1  1  1  1 -1  1  1  1  1  1]
 [ 1  1  1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2783, 2783, 2783, 3366, 3366]
--------------------------------------------------
> Positive example #7
> Pc_6:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [444, 444, 444, 444, 444]
--------------------------------------------------
> Positive example #8
> Pc_7:
[[-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1 -1  1  1  1  1  1  1  1  1]
 [ 1 -1  1  1  1  1 -2  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3384, 3384, 3384, 2549, 2549]
--------------------------------------------------
> Positive example #9
> Pc_8:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [1159, 1159, 1159, 1159, 1159]
--------------------------------------------------
> Positive example #10
> Pc_9:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1 -2]]
> colors:  [-2, -2, -2, -2, 1]
> indices: [3479, 3479, 3479, 3479, 3479]
--------------------------------------------------
> Positive example #11
> Pc_10:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [2577, 2577, 2577, 2577, 2577]
--------------------------------------------------
> Positive example #12
> Pc_11:
[[-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1 -1  1  1  1 -1  1  1  1]
 [ 1  1 -1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2547, 2547, 2547, 2146, 2146]
--------------------------------------------------
> Positive example #13
> Pc_12:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2  1  1  1  1  1 -1 -2  1  1]]
> colors:  [2, 2, 1, 2, 1]
> indices: [3000, 3000, 190, 3000, 231]
--------------------------------------------------
> Positive example #14
> Pc_13:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]
 [ 1  1 -1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3415, 3415, 3415, 2811, 2811]
--------------------------------------------------
> Positive example #15
> Pc_14:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [2838, 2838, 2838, 3363, 2838]
--------------------------------------------------
> Positive example #16
> Pc_15:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1  1 -2  1  1 -1  1  1  1]
 [ 1  1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [860, 860, 860, 2788, 2788]
--------------------------------------------------
> Positive example #17
> Pc_16:
[[-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1 -2  1  1 -1  1  1  1  1  1]
 [ 1  1  1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [21, 21, 21, 1675, 1675]
--------------------------------------------------
> Positive example #18
> Pc_17:
[[2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [1931, 1931, 1931, 1931, 1931]
--------------------------------------------------
> Positive example #19
> Pc_18:
[[-2  2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1 -1  1  1 -1  1  1 -2  1  1]
 [ 1 -1  1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [136, 136, 136, 1703, 1703]
--------------------------------------------------
> Positive example #20
> Pc_19:
[[2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [1931, 1931, 1931, 1931, 1931]
--------------------------------------------------
> Positive example #21
> Pc_20:
[[2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [509, 509, 509, 509, 509]
--------------------------------------------------
> Positive example #22
> Pc_21:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1  1 -2  1  1 -1  1  1  1]
 [ 1  1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [860, 860, 860, 2788, 2788]
--------------------------------------------------
> Positive example #23
> Pc_22:
[[-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [ 1  1  1  1  1  1 -1  1 -1 -1]
 [ 1  1  1  1  1  1 -1  1 -1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2447, 2447, 2447, 3410, 3410]
--------------------------------------------------
> Positive example #24
> Pc_23:
[[ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [-1  1  1 -2  1  1 -1  1 -1  1]
 [-1  1  1  1  1  1 -1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2796, 2796, 2796, 2562, 2562]
--------------------------------------------------
> Positive example #25
> Pc_24:
[[-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  1  1  1  1  1  1]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [2041, 2041, 2041, 3350, 2041]
--------------------------------------------------
> Positive example #26
> Pc_25:
[[-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1 -1  1  1  1  1  1  1 -1  1]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [3342, 3342, 3342, 3319, 3342]
--------------------------------------------------
> Positive example #27
> Pc_26:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [-1  1  1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1449, 1449, 1449, 3007, 3007]
--------------------------------------------------
> Positive example #28
> Pc_27:
[[ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [ 2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [-1  1  1 -2  1  1 -1  1 -1  1]
 [-1  1  1  1  1  1 -1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2796, 2796, 2796, 2562, 2562]
--------------------------------------------------
> Positive example #29
> Pc_28:
[[2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [855, 855, 855, 855, 855]
--------------------------------------------------
> Positive example #30
> Pc_29:
[[ 2 -2 -2 -2 -2  2 -2 -2  2 -2]
 [ 2 -2 -2 -2 -2  2 -2 -2  2 -2]
 [ 2 -2 -2 -2 -2  2 -2 -2  2 -2]
 [-1  1  1  1  1 -1  1  1 -1  1]
 [-1  1  1  1  1 -1  1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2063, 2063, 2063, 205, 205]
--------------------------------------------------
> Positive example #31
> Pc_30:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [1873, 1873, 1873, 1873, 1873]
--------------------------------------------------
> Positive example #32
> Pc_31:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]
 [ 1  1 -1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1972, 1972, 1972, 3468, 3468]
--------------------------------------------------
> Positive example #33
> Pc_32:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [-1  1  1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3458, 3458, 3458, 412, 412]
--------------------------------------------------
> Positive example #34
> Pc_33:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1  1  1  1  1  1  1 -1  1]
 [ 1  1  1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1366, 1366, 1366, 1569, 1569]
--------------------------------------------------
> Positive example #35
> Pc_34:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [183, 183, 183, 183, 145]
--------------------------------------------------
> Positive example #36
> Pc_35:
[[-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1 -1  1  1  1 -1  1  1  1]
 [ 1  1 -1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2547, 2547, 2547, 2146, 2146]
--------------------------------------------------
> Positive example #37
> Pc_36:
[[-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1 -1  1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [2119, 2119, 2119, 2119, 629]
--------------------------------------------------
> Positive example #38
> Pc_37:
[[-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1 -1  1  1  1 -1  1  1  1]
 [ 1  1 -1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2547, 2547, 2547, 2146, 2146]
--------------------------------------------------
> Positive example #39
> Pc_38:
[[-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  1  1  1  1  1  1 -1  1 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [1165, 1165, 1165, 3529, 1165]
--------------------------------------------------
> Positive example #40
> Pc_39:
[[-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [ 1  1  1  1  1  1 -1  1 -1 -1]
 [ 1  1  1  1  1  1 -1  1 -1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2447, 2447, 2447, 3410, 3410]
--------------------------------------------------
> Positive example #41
> Pc_40:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2  1  1  1  1  1  1  1  1]
 [-1  1  1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1740, 1740, 1740, 3053, 3647]
--------------------------------------------------
> Positive example #42
> Pc_41:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [2991, 2991, 2991, 2991, 2991]
--------------------------------------------------
> Positive example #43
> Pc_42:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [2331, 2331, 2331, 2331, 2331]
--------------------------------------------------
> Positive example #44
> Pc_43:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1 -2  1  1  1  1  1 -1  1]
 [ 1  1  1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1865, 1865, 1865, 1692, 1692]
--------------------------------------------------
> Positive example #45
> Pc_44:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1  1  1  1  1  1  1 -1  1]
 [ 1  1  1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2078, 2078, 2078, 1411, 1411]
--------------------------------------------------
> Positive example #46
> Pc_45:
[[-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [-2 -2 -2 -2 -2 -2  2 -2  2  2]
 [ 1  1  1  1  1  1 -1  1 -1 -1]
 [ 1  1  1  1  1  1 -1  1 -1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2447, 2447, 2447, 3410, 3410]
--------------------------------------------------
> Positive example #47
> Pc_46:
[[-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  2  1  1  1  1  1 -1  1  1]
 [ 1 -1  1  1  1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3032, 3032, 3032, 432, 432]
--------------------------------------------------
> Positive example #48
> Pc_47:
[[-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2  2 -2 -2]
 [ 1  1  1 -1  1  1  1 -1  1  1]
 [ 1  1  1 -1  1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3238, 3238, 3238, 1705, 1705]
--------------------------------------------------
> Positive example #49
> Pc_48:
[[-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  2  1  1  1  1  1 -1  1  1]
 [ 1 -1  1  1  1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3032, 3032, 3032, 432, 432]
--------------------------------------------------
> Positive example #50
> Pc_49:
[[-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1 -1  1  1  1  1 -2  1  1  1]
 [ 1 -1  1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [622, 622, 622, 344, 344]
--------------------------------------------------
[info] Found 36 cases for which the majority 'color' does not come from the same training instance

Also observe the color patterns associated with negative examples (majority class)

In [ ]:

Copied!

uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=0)
uknn.analyze_knn(fknn, X_test, L_test, Pc, target_label=0)

> Negative example #1
> Pc_0:
[[-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1  1  1  1 -1  1  1  1  1  1]
 [ 1  1  1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [906, 906, 906, 123, 123]
--------------------------------------------------
> Negative example #2
> Pc_1:
[[-2 -2  2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2  2  1]
 [-2 -2  2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1 -1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [1301, 1301, 1301, 1301, 889]
--------------------------------------------------
> Negative example #3
> Pc_2:
[[-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [-2  1  1  1  1  1 -2  1 -1 -1]
 [-2  1 -2  1  1  1  1 -2 -1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2962, 2962, 2962, 2187, 2187]
--------------------------------------------------
> Negative example #4
> Pc_3:
[[-2 -2 -2 -2 -2  2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2  2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2  2 -2 -2 -2 -2]
 [-2 -2 -2  1  1 -1  1  1  1  1]
 [ 1  1  1  1  1 -1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2063, 2063, 2063, 2840, 3296]
--------------------------------------------------
> Negative example #5
> Pc_4:
[[-2 -2  2  2 -2 -2 -2 -2 -2  2]
 [-2 -2  2  2 -2 -2 -2 -2 -2  2]
 [-2 -2  2  2 -2 -2 -2 -2 -2  2]
 [ 1  1 -1 -1  1  1  1 -2  1 -1]
 [ 1  1 -1 -1  1  1  1  1  1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3125, 3125, 3125, 1668, 1668]
--------------------------------------------------
> Negative example #6
> Pc_5:
[[-2  2 -2 -2 -2  2 -2  2  2 -2]
 [-2  2 -2 -2 -2  2 -2  2  2 -2]
 [-2  2 -2 -2 -2  2 -2  2  2 -2]
 [-2  2 -2 -2 -2  2 -2  2  2 -2]
 [-2  2 -2 -2 -2  2 -2  2  2 -2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [2834, 2834, 2834, 2834, 2834]
--------------------------------------------------
> Negative example #7
> Pc_6:
[[-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [203, 203, 203, 203, 1259]
--------------------------------------------------
> Negative example #8
> Pc_7:
[[ 2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-1  1  1 -1  1  1  1  1  1 -2]]
> colors:  [2, 2, 2, 2, 1]
> indices: [184, 184, 184, 184, 1200]
--------------------------------------------------
> Negative example #9
> Pc_8:
[[-2 -2 -2 -2  2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2  2 -2 -2 -2]
 [ 1  1  1  1 -1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [2558, 2558, 2558, 2558, 2348]
--------------------------------------------------
> Negative example #10
> Pc_9:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [-1  1  1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1807, 1807, 1807, 304, 304]
--------------------------------------------------
> Negative example #11
> Pc_10:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, -2, 1]
> indices: [2821, 2821, 2821, 2821, 2821]
--------------------------------------------------
> Negative example #12
> Pc_11:
[[-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1  1  1  1 -1  1  1 -2  1  1]
 [ 1  1  1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2467, 2467, 2467, 1810, 1810]
--------------------------------------------------
> Negative example #13
> Pc_12:
[[-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 1  1 -2 -2 -1 -2  1  1  1 -2]]
> colors:  [2, 2, 2, 2, 1]
> indices: [27, 27, 27, 27, 1180]
--------------------------------------------------
> Negative example #14
> Pc_13:
[[-2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2  2 -2]
 [ 1  1 -2  1  1  1 -1  1 -1 -2]
 [ 1  1  1  1  1  1 -1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1865, 1865, 1865, 437, 437]
--------------------------------------------------
> Negative example #15
> Pc_14:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2  1  1  1  1  1 -1  1  1  1]
 [ 1  1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1798, 1798, 1798, 1630, 3536]
--------------------------------------------------
> Negative example #16
> Pc_15:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2  1 -2 -2  2 -2  1 -2]
 [ 1  1  1  1  1  1 -1  1  1 -2]
 [ 1  1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 1, 1, 1]
> indices: [1785, 1785, 475, 180, 180]
--------------------------------------------------
> Negative example #17
> Pc_16:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [1406, 1406, 1406, 1406, 1406]
--------------------------------------------------
> Negative example #18
> Pc_17:
[[-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [ 1 -2  1 -1 -1  1  1  1  1  1]
 [ 1  1  1 -1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2117, 2117, 2117, 940, 940]
--------------------------------------------------
> Negative example #19
> Pc_18:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 2, 2]
> indices: [1316, 1316, 1316, 1316, 1316]
--------------------------------------------------
> Negative example #20
> Pc_19:
[[-2 -2 -2 -2  2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2  2 -2 -2  2 -2 -2]
 [ 1  1  1  1 -1  1  1 -1  1  1]
 [ 1  1  1  1 -1  1  1 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2547, 2547, 2547, 1641, 1641]
--------------------------------------------------
> Negative example #21
> Pc_20:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  1  1  1  1  1  1  1  1]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [2218, 2218, 2218, 3626, 2218]
--------------------------------------------------
> Negative example #22
> Pc_21:
[[-2  2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1 -1  1 -1  1 -2 -2  1  1 -2]
 [-2 -1 -2 -1  1  1  1 -2  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2111, 2111, 2111, 1715, 1511]
--------------------------------------------------
> Negative example #23
> Pc_22:
[[-2 -2 -2 -2 -2 -2  2  2 -2  2]
 [-2 -2 -2 -2 -2 -2  2  2 -2  2]
 [-2 -2 -2 -2 -2 -2  2  2 -2  2]
 [ 1 -2  1  1  1  1 -1 -1  1 -1]
 [ 1  1  1  1  1  1 -1 -1  1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [954, 954, 954, 3455, 3455]
--------------------------------------------------
> Negative example #24
> Pc_23:
[[-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2  2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  2  1  1  1  1  1]
 [-2 -2 -2  2  2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [2890, 2890, 2890, 128, 2890]
--------------------------------------------------
> Negative example #25
> Pc_24:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [ 1  1  1  1  1  1  1  1 -2 -1]
 [-2 -2 -2 -2 -2  1 -2 -2 -2  2]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3193, 3193, 3193, 2430, 273]
--------------------------------------------------
> Negative example #26
> Pc_25:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [ 1  1  1  1  1  1  1 -2  1 -1]
 [ 1  1  1  1  1  1  1  1  1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1865, 1865, 1865, 1692, 1692]
--------------------------------------------------
> Negative example #27
> Pc_26:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1 -2  1  1  1  1  1]]
> colors:  [-2, -2, -2, -2, 1]
> indices: [2442, 2442, 2442, 2442, 2442]
--------------------------------------------------
> Negative example #28
> Pc_27:
[[-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1 -1 -2  1 -2  1  1  1 -2  1]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [3152, 3152, 3152, 3541, 3152]
--------------------------------------------------
> Negative example #29
> Pc_28:
[[ 2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-1  1  2  1  1  1  1  1  1  1]
 [-1  1 -1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1586, 1586, 1586, 803, 803]
--------------------------------------------------
> Negative example #30
> Pc_29:
[[-2  2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1 -1  1  1  1  1 -1  1  1 -2]
 [ 1 -1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1998, 1998, 1998, 2118, 2118]
--------------------------------------------------
> Negative example #31
> Pc_30:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2  2]
 [ 1  1  1  1  1 -2  1  1  1 -1]
 [ 1  1  1  1  1  1  1  1  1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1802, 1802, 1802, 1315, 1315]
--------------------------------------------------
> Negative example #32
> Pc_31:
[[-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2  2]
 [ 1  1  1  1  1  1  1  1 -1 -1]
 [ 1  1  1  1  1  1  1  1 -1 -1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1366, 1366, 1366, 2622, 2622]
--------------------------------------------------
> Negative example #33
> Pc_32:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1 -2  1 -2  1  2  1  1  1]
 [-2  1  1  1 -2 -2 -1 -2  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [348, 348, 348, 381, 2169]
--------------------------------------------------
> Negative example #34
> Pc_33:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1 -2  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [3635, 3635, 3635, 3635, 3635]
--------------------------------------------------
> Negative example #35
> Pc_34:
[[-2  2  2 -2  2 -2 -2 -2 -2 -2]
 [-2  2  2 -2  2 -2 -2 -2 -2 -2]
 [-2  2  2 -2  2 -2 -2 -2 -2 -2]
 [ 1 -1 -1  1 -1  1  1  1  1  1]
 [ 1 -1 -1  1 -1  1  1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [194, 194, 194, 314, 314]
--------------------------------------------------
> Negative example #36
> Pc_35:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [3313, 3313, 3313, 3313, 3313]
--------------------------------------------------
> Negative example #37
> Pc_36:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -2  1  1  1  1  1  1]
 [ 1  1  1  1  1  1 -2 -2  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [3531, 3531, 3531, 3531, 3531]
--------------------------------------------------
> Negative example #38
> Pc_37:
[[-2  2 -2  2 -2 -2 -2 -2 -2  2]
 [-2  2 -2  2 -2 -2 -2 -2 -2  2]
 [-2  2 -2  2 -2 -2 -2 -2 -2  2]
 [ 1 -1  1 -1  1 -2  1  1  1 -1]
 [-2  2 -2  2 -2 -2 -2 -2 -2  2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [92, 92, 92, 2612, 92]
--------------------------------------------------
> Negative example #39
> Pc_38:
[[-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2  2 -2]
 [ 1  1  1  1  1  1  1  1  2  1]
 [ 1  1  1  1  1  1  1  1 -1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [3011, 3011, 3011, 3623, 3623]
--------------------------------------------------
> Negative example #40
> Pc_39:
[[-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1 -1 -2  1 -2  1  1  1 -2  1]
 [-2  2 -2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [3152, 3152, 3152, 3541, 3152]
--------------------------------------------------
> Negative example #41
> Pc_40:
[[-2 -2 -2 -2 -2  2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2  2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2  2  2 -2 -2 -2]
 [ 1  1 -2  1  1 -1 -1  1 -2  1]
 [ 1  1  1  1  1 -1 -1  1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [2316, 2316, 2316, 1606, 1606]
--------------------------------------------------
> Negative example #42
> Pc_41:
[[-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  2 -2 -2 -2]
 [ 1  1  1  1  1  1 -1  1  1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [944, 944, 944, 944, 124]
--------------------------------------------------
> Negative example #43
> Pc_42:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, 1, 1]
> indices: [1352, 1352, 1352, 1352, 1352]
--------------------------------------------------
> Negative example #44
> Pc_43:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2  1 -2 -2  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [1194, 1194, 1194, 1194, 567]
--------------------------------------------------
> Negative example #45
> Pc_44:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]]
> colors:  [2, 2, 2, 2, 1]
> indices: [3000, 3000, 3000, 3000, 585]
--------------------------------------------------
> Negative example #46
> Pc_45:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]]
> colors:  [-2, -2, -2, -2, 1]
> indices: [403, 403, 403, 403, 403]
--------------------------------------------------
> Negative example #47
> Pc_46:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -2  1  1  1  1  1  1  1]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]
> colors:  [-2, -2, -2, 1, -2]
> indices: [1781, 1781, 1781, 1781, 1781]
--------------------------------------------------
> Negative example #48
> Pc_47:
[[ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 2  1  1  1  1 -2  1  1  1 -2]
 [-1  1  1 -2  1  1 -2 -2  1 -2]]
> colors:  [2, 2, 2, 1, 1]
> indices: [348, 348, 348, 2860, 2860]
--------------------------------------------------
> Negative example #49
> Pc_48:
[[-2  2 -2 -2  2 -2 -2  2 -2 -2]
 [-2  2 -2 -2  2 -2 -2  2 -2 -2]
 [-2  2 -2 -2  2 -2 -2  2 -2 -2]
 [ 1 -1  1 -2 -1  1  1 -1  1  1]
 [ 1 -1  1  1  2  1 -2 -1  1  1]]
> colors:  [2, 2, 2, 1, 1]
> indices: [1551, 1551, 1551, 3333, 3333]
--------------------------------------------------
> Negative example #50
> Pc_49:
[[ 2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [ 2 -2 -2 -2  2 -2 -2 -2 -2 -2]
 [-1  1  1  1 -1  1  1  1  1  1]
 [ 2 -2 -2 -2  2 -2 -2 -2 -2 -2]]
> colors:  [2, 2, 2, 1, 2]
> indices: [3587, 3587, 3587, 2035, 3587]
--------------------------------------------------
[info] Found 39 cases for which the majority 'color' does not come from the same training instance

In [ ]:

Copied!





import itertools
distances, indices = fknn.search(test_split.X.T)
n_test = indices.shape[0]
i = np.random.choice(range(n_test), 1)[0]
pairs = itertools.product([indices[i][0], ], indices[i, :])
for i, (u, v) in enumerate(list(pairs)): 
    print(f"Pair #{i}: {u} vs {v}")
    print(X_train[u])
    print(X_train[v])
    print("-" * 50)
    print("> Distance=", LA.norm(X_train[u]-X_train[v], 2), '\n')
import itertools
distances, indices = fknn.search(test_split.X.T)
n_test = indices.shape[0]
i = np.random.choice(range(n_test), 1)[0]
pairs = itertools.product([indices[i][0], ], indices[i, :])
for i, (u, v) in enumerate(list(pairs)): 
    print(f"Pair #{i}: {u} vs {v}")
    print(X_train[u])
    print(X_train[v])
    print("-" * 50)
    print("> Distance=", LA.norm(X_train[u]-X_train[v], 2), '\n')

Pair #0: 2528 vs 2528
[0.501 0.    0.124 0.    0.003]
[0.501 0.    0.124 0.    0.003]
--------------------------------------------------
> Distance= 0.0 

Pair #1: 2528 vs 2103
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.126 0.    0.004]
--------------------------------------------------
> Distance= 0.0027999465682221406 

Pair #2: 2528 vs 750
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.126 0.    0.004]
--------------------------------------------------
> Distance= 0.0030372388203489453 

Pair #3: 2528 vs 1140
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.122 0.    0.003]
--------------------------------------------------
> Distance= 0.001741746473479801 

Pair #4: 2528 vs 600
[0.501 0.    0.124 0.    0.003]
[0.501 0.    0.127 0.    0.002]
--------------------------------------------------
> Distance= 0.0032291528295385515 

Pair #5: 2528 vs 2159
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.127 0.    0.002]
--------------------------------------------------
> Distance= 0.0037394488048762384 

Pair #6: 2528 vs 1542
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.124 0.    0.006]
--------------------------------------------------
> Distance= 0.0034048678280782113 

Pair #7: 2528 vs 3598
[0.501 0.    0.124 0.    0.003]
[0.499 0.    0.124 0.    0.006]
--------------------------------------------------
> Distance= 0.003679200041038483 

Pair #8: 2528 vs 2056
[0.501 0.    0.124 0.    0.003]
[0.5   0.    0.127 0.    0.005]
--------------------------------------------------
> Distance= 0.004036093463855927 

Pair #9: 2528 vs 3211
[0.501 0.    0.124 0.    0.003]
[0.499 0.    0.127 0.    0.005]
--------------------------------------------------
> Distance= 0.004580235491064864

Guesstimating the labeling for test split (T)¶

Estimated labeling (lh) vs true labels (y_true)

In [ ]:

Copied!





# Restore confidence matrices (in case if modified)
Pc, C0, Cw, Cn, *rest = \
    uc.evalConfidenceMatrices(R, L_train, alpha=alpha, 
                                    p_threshold=p_threshold, 
                                    conf_measure=conf_measure, policy_threshold=policy_threshold, 
                                    
                                    # Optional debug/test parameters 
                                    U=U, n_train=n_train, fold_number=fold_number, 
                                    is_cascade=True,
                                    verbose=0)
assert (Pc.shape == R.shape) and (Cn.shape == R.shape)
# Restore confidence matrices (in case if modified)
Pc, C0, Cw, Cn, *rest = \
    uc.evalConfidenceMatrices(R, L_train, alpha=alpha, 
                                    p_threshold=p_threshold, 
                                    conf_measure=conf_measure, policy_threshold=policy_threshold, 
                                    
                                    # Optional debug/test parameters 
                                    U=U, n_train=n_train, fold_number=fold_number, 
                                    is_cascade=True,
                                    verbose=0)
assert (Pc.shape == R.shape) and (Cn.shape == R.shape)

(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...

Guesstimated labeling of T via majority vote¶

In [ ]:

Copied!





def f_score(precision, recall, beta=1.0):
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall) 
    return f_beta
# from common import f_score

# Color matric and labeling matrix under gold standard
Pc_true, Lh = pmodel.color_matrix(T, L_test, p_threshold)

# labeling by majority vote
lh_max_vote = uc.estimateLabels(T, p_th=p_threshold, pos_label=1)
acc_max_vote = np.sum(lh_max_vote == L_test) / (len(L_test)+0.0)

# workflow: p_threshold -> lh -> color matrix
Pc_maxvote, Lh0 = pmodel.color_matrix(T, lh_max_vote, p_threshold) # Mc: Color matrix evaluated via estimated labels 
Pf_maxvote = pmodel.to_preference(Pc_maxvote, neutral=0.0)
# => {TP, TN}-entries are desirable and thus encoded as 1s in `Pf_maxvote` whereas {FP, FN}-entries are not desirable hence encoded as 0s

metrics = pmodel.eval_estimated_probability_filter(Pf_maxvote, T, L_test, p_threshold, eps=1e-3)

highlight("Guesstimated labeling (on T) via majority vote")
print(f"> Labeling accuracy: {acc_max_vote}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
def f_score(precision, recall, beta=1.0):
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall) 
    return f_beta
# from common import f_score

# Color matric and labeling matrix under gold standard
Pc_true, Lh = pmodel.color_matrix(T, L_test, p_threshold)

# labeling by majority vote
lh_max_vote = uc.estimateLabels(T, p_th=p_threshold, pos_label=1)
acc_max_vote = np.sum(lh_max_vote == L_test) / (len(L_test)+0.0)

# workflow: p_threshold -> lh -> color matrix
Pc_maxvote, Lh0 = pmodel.color_matrix(T, lh_max_vote, p_threshold) # Mc: Color matrix evaluated via estimated labels 
Pf_maxvote = pmodel.to_preference(Pc_maxvote, neutral=0.0)
# => {TP, TN}-entries are desirable and thus encoded as 1s in `Pf_maxvote` whereas {FP, FN}-entries are not desirable hence encoded as 0s

metrics = pmodel.eval_estimated_probability_filter(Pf_maxvote, T, L_test, p_threshold, eps=1e-3)

highlight("Guesstimated labeling (on T) via majority vote")
print(f"> Labeling accuracy: {acc_max_vote}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")
print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs

================================================================================
Guesstimated labeling (on T) via majority vote
================================================================================
> Labeling accuracy: 0.54
> Reliable-to-correct ratio: 0.54
> Precision: 0.5288064483522509, Recall: 0.6634396052147127
> Predcitio(TP): 0.06301438708477698, Recall(TP): 0.6712310377231844 => f1(TP): 0.1152127367915651
> Error rate: 0.00012119165978768243

Guesstimated labeling of T via kNNs¶

In [ ]:

Copied!





# We already have the color matrix for the training split (R) but let's verify 
Pc_verify, Lh_train = pmodel.color_matrix(R, L_train, p_threshold) 
Pc_train = Pc.A if is_sparse(Pc) else Pc
assert np.all(Pc_verify == Pc_train)

# Color matrix for T can only be estimated as we do not know `L_test` in general
ratios = uknn.estimate_ratios(fknn, R, Pc, n_samples=30)
print(ratios)

# lh_color = pmodel.color_matrix_to_labels(Pc)
# acc_color = np.sum(lh_color == L_test) / (len(L_test)+0.0)
# We already have the color matrix for the training split (R) but let's verify 
Pc_verify, Lh_train = pmodel.color_matrix(R, L_train, p_threshold) 
Pc_train = Pc.A if is_sparse(Pc) else Pc
assert np.all(Pc_verify == Pc_train)

# Color matrix for T can only be estimated as we do not know `L_test` in general
ratios = uknn.estimate_ratios(fknn, R, Pc, n_samples=30)
print(ratios)

# lh_color = pmodel.color_matrix_to_labels(Pc)
# acc_color = np.sum(lh_color == L_test) / (len(L_test)+0.0)

{1: 0.6760969445202233, 0: 0.6760969445202233}

In [ ]:

Copied!





def color_vector(col_vec, label, p_th, reduced_negative=False, pos_label=1, neg_label=0):
    """

    """
    col_vec = np.asarray(col_vec)
    # if col_vec.ndim == 1: 
    #     pass # no-op
    if col_vec.ndim == 2: 
        assert np.squeeze(col_vec).ndim == 1
    col_vec = col_vec.reshape(-1, 1) # turn into a column vector

    Pc_i, Lh_i = pmodel.color_matrix(col_vec, np.asarray([label, ]), p_th=p_th)
    colors = np.squeeze(Pc_i)
    assert colors.ndim == 1
    
    return colors

print(list(R[:, 3]))
print(color_vector(R[:, 3], label=1, p_th=p_threshold))
print(color_vector(R[:, 3], label=0, p_th=p_threshold))

matching_fn = uknn.estimate_labels_by_matching(fknn, R, Pc, p_threshold, verbose=1)
lh_knn = matching_fn(T)
acc_knn = np.sum(lh_knn == L_test) / (len(L_test)+0.0)

Pc_knn, Lh1 = pmodel.color_matrix(T, lh_knn, p_threshold) # Mc: Color matrix evaluated via estimated labels 
Pf_knn = pmodel.to_preference(Pc_knn, neutral=0.0)

metrics = pmodel.eval_estimated_probability_filter(Pf_knn, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via kNN")
print(f"> Labeling accuracy: {acc_knn}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
# ... downside is that high accuracy could be due to TNs but not TPs 

print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")

print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs
def color_vector(col_vec, label, p_th, reduced_negative=False, pos_label=1, neg_label=0):
    """

    """
    col_vec = np.asarray(col_vec)
    # if col_vec.ndim == 1: 
    #     pass # no-op
    if col_vec.ndim == 2: 
        assert np.squeeze(col_vec).ndim == 1
    col_vec = col_vec.reshape(-1, 1) # turn into a column vector

    Pc_i, Lh_i = pmodel.color_matrix(col_vec, np.asarray([label, ]), p_th=p_th)
    colors = np.squeeze(Pc_i)
    assert colors.ndim == 1
    
    return colors

print(list(R[:, 3]))
print(color_vector(R[:, 3], label=1, p_th=p_threshold))
print(color_vector(R[:, 3], label=0, p_th=p_threshold))

matching_fn = uknn.estimate_labels_by_matching(fknn, R, Pc, p_threshold, verbose=1)
lh_knn = matching_fn(T)
acc_knn = np.sum(lh_knn == L_test) / (len(L_test)+0.0)

Pc_knn, Lh1 = pmodel.color_matrix(T, lh_knn, p_threshold) # Mc: Color matrix evaluated via estimated labels 
Pf_knn = pmodel.to_preference(Pc_knn, neutral=0.0)

metrics = pmodel.eval_estimated_probability_filter(Pf_knn, T, L_test, p_threshold, eps=1e-3)
highlight("Guesstimated labeling (on T) via kNN")
print(f"> Labeling accuracy: {acc_knn}")
print(f"> Reliable-to-correct ratio: {metrics['p_overlap']}") # Fraction of entries predicted reliable and are actually correct (TPs or TNs)
# ... downside is that high accuracy could be due to TNs but not TPs 

print(f"> Precision: {metrics['precision']}, Recall: {metrics['recall']}")
print(f"> Predcitio(TP): {metrics['precision_tp']}, Recall(TP): {metrics['recall_tp']} => f1(TP): {f_score(metrics['precision_tp'], metrics['recall_tp'])}")

print(f"> Error rate: {metrics['p_missed']}") # Probability of predicting reliable but hitting either FPs or FNs

[0.5005034693335313, 0.0, 0.08009972560479113, 4.519121183621038e-16, 0.0008115769903625958]
[ 2.  2.  2. -1. -1.]
[-2. -2. -2.  1.  1.]
[info] Pc_i:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]
 [ 1  1 -1  1  1  1  1  1  1  1]]

... Label = 1
... Color(ti): [-1.  2.  2. -1. -1.]
... N_matches(ti): 0
...... sum distances: 9.2
... Label = 0
... Color(ti): [ 1. -2. -2.  1.  1.]
... N_matches(ti): 0
...... sum distances: 2.8000000000000003

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2  1 -2  1 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]

... Label = 1
... Color(ti): [-1.  2.  2. -1.  2.]
... N_matches(ti): 0
...... sum distances: 10.0
... Label = 0
... Color(ti): [ 1. -2. -2.  1. -2.]
... N_matches(ti): 0
...... sum distances: 2.4000000000000004

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2  2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1 -1  1  1  1  1  1  1]]

... Label = 1
... Color(ti): [-1.  2.  2.  2. -1.]
... N_matches(ti): 0
...... sum distances: 9.2
... Label = 0
... Color(ti): [ 1. -2. -2. -2.  1.]
... N_matches(ti): 0
...... sum distances: 2.8000000000000007

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]

... Label = 1
... Color(ti): [-1.  2.  2. -1.  2.]
... N_matches(ti): 0
...... sum distances: 10.0
... Label = 0
... Color(ti): [ 1. -2. -2.  1. -2.]
... N_matches(ti): 0
...... sum distances: 1.9999999999999998

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 1  1  1  1  1  1  1  1  1  1]]

... Label = 1
... Color(ti): [-1.  2.  2. -1. -1.]
... N_matches(ti): 0
...... sum distances: 10.0
... Label = 0
... Color(ti): [ 1. -2. -2.  1.  1.]
... N_matches(ti): 0
...... sum distances: 1.9999999999999998

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  1  1  1  1  1  1 -1  1  1]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]]

... Label = 1
... Color(ti): [-1.  2.  2. -1.  2.]
... N_matches(ti): 0
...... sum distances: 9.2
... Label = 0
... Color(ti): [ 1. -2. -2.  1. -2.]
... N_matches(ti): 0
...... sum distances: 2.8000000000000003

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1  1  1  1  1  1 -2 -2  1]
 [-2 -2 -2 -2 -2 -2 -2 -2 -2 -2]]

... Label = 1
... Color(ti): [-1.  2.  2. -1.  2.]
... N_matches(ti): 0
...... sum distances: 10.0
... Label = 0
... Color(ti): [ 1. -2. -2.  1. -2.]
... N_matches(ti): 0
...... sum distances: 2.4

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [-2 -2 -2 -2 -2 -2 -2  2 -2 -2]
 [ 1  1  1  1  1  1  1 -1  1  1]
 [ 1  1  1  1  1  1  1 -1  1  1]]

... Label = 1
... Color(ti): [-1.  2.  2. -1. -1.]
... N_matches(ti): 0
...... sum distances: 9.2
... Label = 0
... Color(ti): [ 1. -2. -2.  1.  1.]
... N_matches(ti): 0
...... sum distances: 2.8000000000000003

--------------------------------------------------
[info] Pc_i:
[[-2 -2 -2  2 -2  2 -2 -2  2 -2]
 [-2 -2 -2  2 -2  2 -2 -2  2 -2]
 [-2 -2 -2  2 -2  2 -2 -2  2 -2]
 [ 1  1  1 -1  1 -1  1  1 -1  1]
 [ 1  1  1 -1  1 -1  1  1 -1  1]]

... Label = 1
... Color(ti): [-1.  2.  2.  2. -1.]
... N_matches(ti): 0
...... sum distances: 8.200000000000001
... Label = 0
... Color(ti): [ 1. -2. -2. -2.  1.]
... N_matches(ti): 0
...... sum distances: 5.800000000000001

--------------------------------------------------
[info] Pc_i:
[[-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [-2 -2  2 -2 -2 -2 -2 -2 -2 -2]
 [ 1  1 -1  1  1  1  1  1  1  1]
 [ 1  1 -1  1  1  1  1  1  1  1]]

... Label = 1
... Color(ti): [-1.  2.  2. -1. -1.]
... N_matches(ti): 0
...... sum distances: 9.2
... Label = 0
... Color(ti): [ 1. -2. -2.  1.  1.]
... N_matches(ti): 0
...... sum distances: 2.8000000000000003

--------------------------------------------------
================================================================================
Guesstimated labeling (on T) via kNN
================================================================================
> Labeling accuracy: 0.9008
> Reliable-to-correct ratio: 0.9008
> Precision: 0.901522221729115, Recall: 0.8980313333232225
> Predcitio(TP): 0.018140583692716653, Recall(TP): 0.15342423719387072 => f1(TP): 0.03244494064603671
> Error rate: 3.19006875384525e-05

Use kNN-estimated labeling and color matrix as a probability filter¶

A. Prepare the data

In [ ]:

Copied!





import utils_stacking as ustk

assert Pc.shape == R.shape, "At this point the `Pc` should only hold the colors for R"
R = train_split.X
L_train = train_split.L
T = test_split.X

# L = np.hstack([L_train, lh_knn]) # Note that the labeling for the test split is just an estimate (previously we used majority vote; here we use kNNs)
Pc, C0, Cw, Cn, *rest = uc.eval_confidence_given_color_matrix(
                                           X=(R, T), 
                                           L=(L_train, lh_knn), 
                                           Pc=(Pc, Pc_knn), 
                                           # n_train = R.shape[1], # if (R, T) is passed instead of an already-merged X, then n_train is not needed
                                           alpha=alpha, 
                                           p_threshold=p_threshold, 
                                           conf_measure=conf_measure, 
                                           policy_threshold=policy_threshold)

ustk.verify_shape(X, R, T, L, U, p_threshold) # verify the shape of all key quantities
# Check: Wherever Pc is negative, the corresponding entries in Cn must be 0 (By constrast, C is a full/dense confidence matrix)
assert np.all(Cn[Pc < 0]==0)
assert np.all(Cn[Pc > 0]>0)

X = np.hstack([R, T])
L = np.hstack([L_train, lh_knn])
import utils_stacking as ustk

assert Pc.shape == R.shape, "At this point the `Pc` should only hold the colors for R"
R = train_split.X
L_train = train_split.L
T = test_split.X

# L = np.hstack([L_train, lh_knn]) # Note that the labeling for the test split is just an estimate (previously we used majority vote; here we use kNNs)
Pc, C0, Cw, Cn, *rest = uc.eval_confidence_given_color_matrix(
                                           X=(R, T), 
                                           L=(L_train, lh_knn), 
                                           Pc=(Pc, Pc_knn), 
                                           # n_train = R.shape[1], # if (R, T) is passed instead of an already-merged X, then n_train is not needed
                                           alpha=alpha, 
                                           p_threshold=p_threshold, 
                                           conf_measure=conf_measure, 
                                           policy_threshold=policy_threshold)

ustk.verify_shape(X, R, T, L, U, p_threshold) # verify the shape of all key quantities
# Check: Wherever Pc is negative, the corresponding entries in Cn must be 0 (By constrast, C is a full/dense confidence matrix)
assert np.all(Cn[Pc < 0]==0)
assert np.all(Cn[Pc > 0]>0)

X = np.hstack([R, T])
L = np.hstack([L_train, lh_knn])

(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...

Train CFNet with the new data¶

R stays the same
Quantities associated with T (e.g. lh_knn, Pc_knn) are merged with their counterparts associated with training split

In [ ]:

Copied!





import cf_models as cm

n_users, n_items = X.shape

fold_number = 0
test_size = 0.1

policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100

lr = 0.001 
batch_size = 64
epochs = 200

loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...

# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix 
# 
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label

cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
# cf_model = cm.get_cfnet_approximating_labels(n_users, n_items, n_factors)

cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn), 
                                input_data=(R, T, U, L_train, L_test), 

                                # Should we combine R and T into a single matrix X? Set to True if so
                                is_cascade = True, # Set to True here to combine R and T into X 
                                lh = lh_knn, # supply the pre-computed, kNN-based estimated labels for T
                                
                                # SGD optimization parameters
                                test_size = test_size,
                                epochs = epochs, 
                                batch_size=batch_size, 

                                # CF hyperparameters
                                # n_factors=n_factors, # this is factored into model definition
                                alpha=alpha, 
                                conf_measure=conf_measure, 
                                # conf_type='Cn', # default sparse confidence matrix (Cn)
                                # target_type=target_type,
                                
                                policy_threshold=policy_threshold, 
                                fold_number=fold_number)
import cf_models as cm

n_users, n_items = X.shape

fold_number = 0
test_size = 0.1

policy_threshold = 'fmax'
conf_measure = 'brier'
n_factors = 100
alpha = 100

lr = 0.001 
batch_size = 64
epochs = 200

loss_fn = tf.keras.losses.BinaryCrossentropy() # Options: tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.MeanSquaredError(), ...

# Configure `target_type` (Options: 'generic', 'rating', 'label')
# 1. Choose 'label' if the BCE loss is used (because the CF model in this case attempts to approximates the label encoded in 0 and 1)
# 2. Choose 'rating' if MSE is used (because the CF model in this case approximates the rating, which is a regression problem)
# 3. Choose 'generic' for customized loss function with potentially more complex labeling information where "y_true" is a matrix 
# 
# Note that you are unlikely need to configure `target_type` because cf_models module has a method that will determine this for you automatically
# target_type = 'label' # if we use BCE, then the model approximates the label

cf_model = cm.get_cfnet_compiled(n_users, n_items, n_factors, loss=loss_fn, lr=lr)
# cf_model = cm.get_cfnet_approximating_labels(n_users, n_items, n_factors)

cf_model = cm.training_pipeline(input_model=(cf_model, loss_fn), 
                                input_data=(R, T, U, L_train, L_test), 

                                # Should we combine R and T into a single matrix X? Set to True if so
                                is_cascade = True, # Set to True here to combine R and T into X 
                                lh = lh_knn, # supply the pre-computed, kNN-based estimated labels for T
                                
                                # SGD optimization parameters
                                test_size = test_size,
                                epochs = epochs, 
                                batch_size=batch_size, 

                                # CF hyperparameters
                                # n_factors=n_factors, # this is factored into model definition
                                alpha=alpha, 
                                conf_measure=conf_measure, 
                                # conf_type='Cn', # default sparse confidence matrix (Cn)
                                # target_type=target_type,
                                
                                policy_threshold=policy_threshold, 
                                fold_number=fold_number)

[merge] Merging 'L_train' and 'lh': len(L_train): 3750 || len(lh): 1250 => len(L): 5000
[merge] Merging 'R' and 'T': shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000)

(make_cn) Using WEIGHTED confidence matrix to approximate ratings ...
[info] Confidence matrix type: Cn, target data type: label
Epoch 1/200
352/352 [==============================] - 4s 8ms/step - loss: 2.7591 - val_loss: 3.3721
Epoch 2/200
352/352 [==============================] - 2s 7ms/step - loss: 2.2233 - val_loss: 1.9442
Epoch 3/200
352/352 [==============================] - 2s 7ms/step - loss: 2.1033 - val_loss: 2.1897
Epoch 4/200
352/352 [==============================] - 2s 7ms/step - loss: 1.1302 - val_loss: 1.1313
Epoch 5/200
352/352 [==============================] - 2s 7ms/step - loss: 0.7799 - val_loss: 1.0086
Epoch 6/200
352/352 [==============================] - 2s 6ms/step - loss: 0.6457 - val_loss: 0.9135
Epoch 7/200
352/352 [==============================] - 2s 7ms/step - loss: 0.5857 - val_loss: 0.8766
Epoch 8/200
352/352 [==============================] - 2s 7ms/step - loss: 0.5545 - val_loss: 0.8564
Epoch 9/200
352/352 [==============================] - 2s 7ms/step - loss: 0.5353 - val_loss: 0.8500
Epoch 10/200
352/352 [==============================] - 3s 7ms/step - loss: 0.5183 - val_loss: 0.8246
Epoch 11/200
352/352 [==============================] - 2s 7ms/step - loss: 0.5021 - val_loss: 0.8175
Epoch 12/200
352/352 [==============================] - 2s 7ms/step - loss: 0.4950 - val_loss: 0.7812
Epoch 13/200
352/352 [==============================] - 2s 6ms/step - loss: 0.4759 - val_loss: 0.7763
Epoch 14/200
352/352 [==============================] - 2s 7ms/step - loss: 0.4656 - val_loss: 0.7545
Epoch 15/200
352/352 [==============================] - 2s 6ms/step - loss: 0.4494 - val_loss: 0.7488
Epoch 16/200
352/352 [==============================] - 2s 7ms/step - loss: 0.4357 - val_loss: 0.7209
Epoch 17/200
352/352 [==============================] - 3s 7ms/step - loss: 0.4198 - val_loss: 0.7068
Epoch 18/200
352/352 [==============================] - 2s 7ms/step - loss: 0.4044 - val_loss: 0.6898
Epoch 19/200
352/352 [==============================] - 3s 7ms/step - loss: 0.3898 - val_loss: 0.6753
Epoch 20/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3787 - val_loss: 0.6564
Epoch 21/200
352/352 [==============================] - 3s 7ms/step - loss: 0.3645 - val_loss: 0.6430
Epoch 22/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3536 - val_loss: 0.6318
Epoch 23/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3447 - val_loss: 0.6222
Epoch 24/200
352/352 [==============================] - 3s 7ms/step - loss: 0.3371 - val_loss: 0.6135
Epoch 25/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3302 - val_loss: 0.6052
Epoch 26/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3238 - val_loss: 0.5967
Epoch 27/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3176 - val_loss: 0.5886
Epoch 28/200
352/352 [==============================] - 2s 6ms/step - loss: 0.3117 - val_loss: 0.5802
Epoch 29/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3060 - val_loss: 0.5720
Epoch 30/200
352/352 [==============================] - 2s 7ms/step - loss: 0.3004 - val_loss: 0.5641
Epoch 31/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2949 - val_loss: 0.5556
Epoch 32/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2895 - val_loss: 0.5476
Epoch 33/200
352/352 [==============================] - 2s 6ms/step - loss: 0.2843 - val_loss: 0.5397
Epoch 34/200
352/352 [==============================] - 2s 6ms/step - loss: 0.2795 - val_loss: 0.5318
Epoch 35/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2742 - val_loss: 0.5241
Epoch 36/200
352/352 [==============================] - 3s 8ms/step - loss: 0.2696 - val_loss: 0.5160
Epoch 37/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2646 - val_loss: 0.5082
Epoch 38/200
352/352 [==============================] - 3s 7ms/step - loss: 0.2603 - val_loss: 0.5011
Epoch 39/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2557 - val_loss: 0.4947
Epoch 40/200
352/352 [==============================] - 2s 6ms/step - loss: 0.2514 - val_loss: 0.4852
Epoch 41/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2462 - val_loss: 0.4789
Epoch 42/200
352/352 [==============================] - 2s 6ms/step - loss: 0.2419 - val_loss: 0.4703
Epoch 43/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2383 - val_loss: 0.4630
Epoch 44/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2329 - val_loss: 0.4575
Epoch 45/200
352/352 [==============================] - 2s 6ms/step - loss: 0.2301 - val_loss: 0.4487
Epoch 46/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2245 - val_loss: 0.4418
Epoch 47/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2210 - val_loss: 0.4347
Epoch 48/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2168 - val_loss: 0.4273
Epoch 49/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2118 - val_loss: 0.4206
Epoch 50/200
352/352 [==============================] - 3s 7ms/step - loss: 0.2100 - val_loss: 0.4197
Epoch 51/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2050 - val_loss: 0.4073
Epoch 52/200
352/352 [==============================] - 2s 7ms/step - loss: 0.2003 - val_loss: 0.4007
Epoch 53/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1967 - val_loss: 0.3942
Epoch 54/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1939 - val_loss: 0.3879
Epoch 55/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1895 - val_loss: 0.3826
Epoch 56/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1869 - val_loss: 0.3764
Epoch 57/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1824 - val_loss: 0.3693
Epoch 58/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1790 - val_loss: 0.3638
Epoch 59/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1763 - val_loss: 0.3575
Epoch 60/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1728 - val_loss: 0.3518
Epoch 61/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1687 - val_loss: 0.3460
Epoch 62/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1656 - val_loss: 0.3407
Epoch 63/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1627 - val_loss: 0.3349
Epoch 64/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1602 - val_loss: 0.3319
Epoch 65/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1570 - val_loss: 0.3244
Epoch 66/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1533 - val_loss: 0.3189
Epoch 67/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1502 - val_loss: 0.3138
Epoch 68/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1486 - val_loss: 0.3087
Epoch 69/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1445 - val_loss: 0.3039
Epoch 70/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1419 - val_loss: 0.2995
Epoch 71/200
352/352 [==============================] - 2s 6ms/step - loss: 0.1391 - val_loss: 0.2939
Epoch 72/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1366 - val_loss: 0.2892
Epoch 73/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1337 - val_loss: 0.2843
Epoch 74/200
352/352 [==============================] - 3s 8ms/step - loss: 0.1310 - val_loss: 0.2794
Epoch 75/200
352/352 [==============================] - 2s 6ms/step - loss: 0.1285 - val_loss: 0.2749
Epoch 76/200
352/352 [==============================] - 2s 6ms/step - loss: 0.1263 - val_loss: 0.2711
Epoch 77/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1241 - val_loss: 0.2661
Epoch 78/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1211 - val_loss: 0.2617
Epoch 79/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1187 - val_loss: 0.2578
Epoch 80/200
352/352 [==============================] - 3s 9ms/step - loss: 0.1164 - val_loss: 0.2536
Epoch 81/200
352/352 [==============================] - 2s 6ms/step - loss: 0.1145 - val_loss: 0.2523
Epoch 82/200
352/352 [==============================] - 2s 6ms/step - loss: 0.1123 - val_loss: 0.2452
Epoch 83/200
352/352 [==============================] - 3s 8ms/step - loss: 0.1095 - val_loss: 0.2412
Epoch 84/200
352/352 [==============================] - 2s 7ms/step - loss: 0.1074 - val_loss: 0.2372
Epoch 85/200
352/352 [==============================] - 3s 8ms/step - loss: 0.1053 - val_loss: 0.2332
Epoch 86/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1033 - val_loss: 0.2302
Epoch 87/200
352/352 [==============================] - 3s 7ms/step - loss: 0.1013 - val_loss: 0.2259
Epoch 88/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0993 - val_loss: 0.2222
Epoch 89/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0975 - val_loss: 0.2187
Epoch 90/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0952 - val_loss: 0.2147
Epoch 91/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0933 - val_loss: 0.2113
Epoch 92/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0915 - val_loss: 0.2077
Epoch 93/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0896 - val_loss: 0.2043
Epoch 94/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0879 - val_loss: 0.2011
Epoch 95/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0862 - val_loss: 0.1979
Epoch 96/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0845 - val_loss: 0.1946
Epoch 97/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0828 - val_loss: 0.1913
Epoch 98/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0810 - val_loss: 0.1881
Epoch 99/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0793 - val_loss: 0.1850
Epoch 100/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0778 - val_loss: 0.1822
Epoch 101/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0764 - val_loss: 0.1794
Epoch 102/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0747 - val_loss: 0.1763
Epoch 103/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0732 - val_loss: 0.1735
Epoch 104/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0717 - val_loss: 0.1707
Epoch 105/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0703 - val_loss: 0.1680
Epoch 106/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0689 - val_loss: 0.1653
Epoch 107/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0675 - val_loss: 0.1625
Epoch 108/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0660 - val_loss: 0.1600
Epoch 109/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0647 - val_loss: 0.1572
Epoch 110/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0635 - val_loss: 0.1547
Epoch 111/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0621 - val_loss: 0.1522
Epoch 112/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0609 - val_loss: 0.1497
Epoch 113/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0597 - val_loss: 0.1475
Epoch 114/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0584 - val_loss: 0.1449
Epoch 115/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0572 - val_loss: 0.1426
Epoch 116/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0560 - val_loss: 0.1403
Epoch 117/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0549 - val_loss: 0.1384
Epoch 118/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0538 - val_loss: 0.1359
Epoch 119/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0527 - val_loss: 0.1337
Epoch 120/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0516 - val_loss: 0.1316
Epoch 121/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0506 - val_loss: 0.1297
Epoch 122/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0495 - val_loss: 0.1275
Epoch 123/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0485 - val_loss: 0.1255
Epoch 124/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0475 - val_loss: 0.1237
Epoch 125/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0466 - val_loss: 0.1216
Epoch 126/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0456 - val_loss: 0.1197
Epoch 127/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0446 - val_loss: 0.1178
Epoch 128/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0437 - val_loss: 0.1160
Epoch 129/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0429 - val_loss: 0.1142
Epoch 130/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0419 - val_loss: 0.1125
Epoch 131/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0411 - val_loss: 0.1108
Epoch 132/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0403 - val_loss: 0.1088
Epoch 133/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0394 - val_loss: 0.1074
Epoch 134/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0386 - val_loss: 0.1058
Epoch 135/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0378 - val_loss: 0.1041
Epoch 136/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0371 - val_loss: 0.1025
Epoch 137/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0363 - val_loss: 0.1010
Epoch 138/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0355 - val_loss: 0.0993
Epoch 139/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0348 - val_loss: 0.0978
Epoch 140/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0341 - val_loss: 0.0964
Epoch 141/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0334 - val_loss: 0.0949
Epoch 142/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0327 - val_loss: 0.0936
Epoch 143/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0320 - val_loss: 0.0921
Epoch 144/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0314 - val_loss: 0.0908
Epoch 145/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0307 - val_loss: 0.0895
Epoch 146/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0301 - val_loss: 0.0881
Epoch 147/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0295 - val_loss: 0.0868
Epoch 148/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0289 - val_loss: 0.0856
Epoch 149/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0283 - val_loss: 0.0842
Epoch 150/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0277 - val_loss: 0.0831
Epoch 151/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0271 - val_loss: 0.0818
Epoch 152/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0265 - val_loss: 0.0806
Epoch 153/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0260 - val_loss: 0.0795
Epoch 154/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0255 - val_loss: 0.0783
Epoch 155/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0249 - val_loss: 0.0772
Epoch 156/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0244 - val_loss: 0.0762
Epoch 157/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0239 - val_loss: 0.0749
Epoch 158/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0234 - val_loss: 0.0740
Epoch 159/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0230 - val_loss: 0.0730
Epoch 160/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0225 - val_loss: 0.0719
Epoch 161/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0220 - val_loss: 0.0709
Epoch 162/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0216 - val_loss: 0.0699
Epoch 163/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0211 - val_loss: 0.0689
Epoch 164/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0207 - val_loss: 0.0680
Epoch 165/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0202 - val_loss: 0.0671
Epoch 166/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0198 - val_loss: 0.0662
Epoch 167/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0194 - val_loss: 0.0652
Epoch 168/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0190 - val_loss: 0.0644
Epoch 169/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0186 - val_loss: 0.0635
Epoch 170/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0182 - val_loss: 0.0627
Epoch 171/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0179 - val_loss: 0.0618
Epoch 172/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0175 - val_loss: 0.0610
Epoch 173/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0171 - val_loss: 0.0602
Epoch 174/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0168 - val_loss: 0.0594
Epoch 175/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0164 - val_loss: 0.0586
Epoch 176/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0161 - val_loss: 0.0579
Epoch 177/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0158 - val_loss: 0.0571
Epoch 178/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0154 - val_loss: 0.0563
Epoch 179/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0151 - val_loss: 0.0556
Epoch 180/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0148 - val_loss: 0.0549
Epoch 181/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0145 - val_loss: 0.0541
Epoch 182/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0142 - val_loss: 0.0534
Epoch 183/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0139 - val_loss: 0.0528
Epoch 184/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0136 - val_loss: 0.0522
Epoch 185/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0133 - val_loss: 0.0515
Epoch 186/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0131 - val_loss: 0.0508
Epoch 187/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0128 - val_loss: 0.0502
Epoch 188/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0125 - val_loss: 0.0495
Epoch 189/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0123 - val_loss: 0.0490
Epoch 190/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0120 - val_loss: 0.0484
Epoch 191/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0118 - val_loss: 0.0478
Epoch 192/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0115 - val_loss: 0.0472
Epoch 193/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0113 - val_loss: 0.0467
Epoch 194/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0111 - val_loss: 0.0461
Epoch 195/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0108 - val_loss: 0.0456
Epoch 196/200
352/352 [==============================] - 3s 8ms/step - loss: 0.0106 - val_loss: 0.0450
Epoch 197/200
352/352 [==============================] - 2s 7ms/step - loss: 0.0104 - val_loss: 0.0445
Epoch 198/200
352/352 [==============================] - 2s 6ms/step - loss: 0.0102 - val_loss: 0.0441
Epoch 199/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0100 - val_loss: 0.0436
Epoch 200/200
352/352 [==============================] - 3s 7ms/step - loss: 0.0098 - val_loss: 0.0430

In [ ]:

Copied!





analyzer = cm.analyze_reconstruction(cf_model, X, L, Pc, n_train, p_threshold=p_threshold, policy_threshold=policy_threshold)
highlight("(BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=False)
highlight("(BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=True)
analyzer = cm.analyze_reconstruction(cf_model, X, L, Pc, n_train, p_threshold=p_threshold, policy_threshold=policy_threshold)
highlight("(BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=False)
highlight("(BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings")
reestimated = analyzer(L_test, unreliable_only=True)

================================================================================
(BCE) Reestimate the entire rating matrix (X) with learned latent factors/embeddings
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 74.72180508554011
[info] From T to Th, delta(Frobenius norm)= 38.623946721693805
[info] How different are lh and lh_new? 0.4576
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.19364161849710984
[result] Majority vote: F1 score with re-estimated Th: 0.18543046357615894

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.1842105263157895

[result] Best settings (complete): lh_maxvote, score: 0.20470262793914248

================================================================================
(BCE) Reestimate ONLY the unreliable entries in X with learned latent factors/embeddings
================================================================================
[info] From R to Rh, delta(Frobenius norm)= 68.85513758278863
[info] From T to Th, delta(Frobenius norm)= 34.85557728925823
[info] How different are lh and lh_new? 0.4584
[result] Majority vote: F1 score with the original T:  0.20470262793914248
[result] Majority vote: F1 score with re-estimated Th using original p_threshold: 0.2
[result] Majority vote: F1 score with re-estimated Th: 0.1842105263157895

[result] Stacking: F1 score with the original T:  0.125
[result] Stacking: F1 score with re-estimated Th: 0.1842105263157895

[result] Best settings (unreliable only): lh_maxvote, score: 0.20470262793914248

In [ ]: