COMP24112 Machine Learning Lab 2: News Article Classification by k-NN
In [3]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
# Load the data set. Judging by the cells below: `data` is a sparse matrix with one row per
# document and one column per vocabulary word, `labels` holds one class index per document,
# and `class_names` / `vocabulary` translate class and word indices into readable names.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, so only
# load a copy of this file that comes from a trusted source.
data, labels, class_names, vocabulary = np.load("ReutersNews_4Classes_sparse.npy", allow_pickle=True)
In [2]:
# Inspect one document (row index 41, i.e. the 42nd document) in sparse and dense form.
print(data[41,:]) # A sparse row vector; the output will be the non-zero indices and their values.
print(data[41,:].toarray()) # Convert back to a NumPy array. Note that the result is a (1, 6428) matrix, not a vector.
# print(vocabulary[data[41,:] > 0]) # Can't index vocabulary with a sparse matrix.
# scipy.sparse.find returns the row indices, column indices and values of the non-zero entries.
rows, columns, values = scipy.sparse.find(data[41,:]) # Find the non-zero entries in the 42nd document.
print(vocabulary[columns]) # Prints the words present in the 42nd document.
In [3]:
# Print every word of the vocabulary on a single line, separated by commas.
print(", ".join(vocabulary))
In [4]:
# Look up a single matrix entry: row i selects a document, column j selects a vocabulary word.
i, j = 40, 2
print(data[i,j])
In [5]:
# Class index of document i (i is assigned in the previous cell).
print(labels[i])
In [6]:
# Show how the four loaded objects relate: the count stored for word 10 in document 0,
# the name of document 0's class, and the vocabulary word that column 10 stands for.
print("Occurrences:", data[0,10])
print("Class:", class_names[labels[0]])
print("Word:", vocabulary[10])
In [7]:
def sample_indices(labels, *num_per_class):
    """
    Returns randomly selected indices. It will return the specified number of indices for each class.

    labels: A vector where labels[i] is the integer class of sample i.
    num_per_class: num_per_class[c] is the number of indices to draw (without replacement)
        from the samples whose label equals c.

    Returns: A 1-D integer array holding the indices for class 0 first, then class 1, and so on.
        The array always has an integer dtype, even when it is empty, so it can be used
        directly to index `labels` or the data matrix.
    Raises: ValueError (from np.random.choice) if a class has fewer samples than requested.
    """
    # Accept plain sequences too; `labels == cls` must be an element-wise comparison.
    labels = np.asarray(labels)
    indices = []
    for cls, num in enumerate(num_per_class):
        cls_indices = np.where(labels == cls)[0]
        indices.extend(np.random.choice(cls_indices, size=num, replace=False))
    # Without an explicit dtype an empty selection would become a float64 array,
    # which NumPy refuses to use as an index.
    return np.array(indices, dtype=np.intp)
In [8]:
# Demonstrate sample_indices: draw 1, 2, 3 and 4 random documents from classes 0, 1, 2 and 3.
indices = sample_indices(labels, 1, 2, 3, 4)
print("Returned indices:", indices)
# An index array selects the corresponding rows of the data matrix and entries of labels.
print("Samples:", data[indices])
print("Corresponding classes:", labels[indices])
In [9]:
import scipy.stats
def _as_dense_2d(samples):
    """Return `samples` as a dense 2-D float array; SciPy sparse input is densified first."""
    if scipy.sparse.issparse(samples):
        samples = samples.toarray()
    return np.atleast_2d(np.asarray(samples, dtype=float))


def knn_classify(test_samples, training_data, training_labels, metric="euclidean", k=1):
    """
    Performs k-nearest neighbour classification on the provided samples,
    given training data and the corresponding labels.

    test_samples: An m x d matrix of m samples to classify, each with d features.
        Dense arrays and SciPy sparse matrices are both accepted.
    training_data: An n x d matrix consisting of n training samples, each with d features.
        Dense arrays and SciPy sparse matrices are both accepted.
    training_labels: A vector of size n, where training_labels[i] is the label of training_data[i].
    metric: The metric to use for calculating distances between samples. Any metric name
        understood by scipy.spatial.distance.cdist works, e.g. "euclidean" or "cosine".
    k: The number of nearest neighbours to use for classification (1 <= k <= n).

    Returns: A vector of size m, where out[i] is the predicted class of test_samples[i].
        Equal distances are resolved in favour of the lower training index and equal
        vote counts in favour of the smallest label, so the result is deterministic.
    Raises: ValueError if k is outside 1..n or the number of labels differs from n.
    """
    # Imported here because the file-level imports do not include scipy.spatial.
    from scipy.spatial.distance import cdist

    # cdist needs dense input. NOTE: this materialises both matrices in memory.
    test = _as_dense_2d(test_samples)
    train = _as_dense_2d(training_data)
    training_labels = np.asarray(training_labels).reshape(-1)

    num_training = train.shape[0]
    if training_labels.shape[0] != num_training:
        raise ValueError(
            "training_labels has %d entries but training_data has %d rows"
            % (training_labels.shape[0], num_training)
        )
    if not 1 <= k <= num_training:
        raise ValueError(
            "k must be between 1 and the number of training samples (%d), got %r"
            % (num_training, k)
        )
    num_test = test.shape[0]
    if num_test == 0:
        return np.empty(0, dtype=training_labels.dtype)

    # Calculate an m x n distance matrix.
    pairwise_distance = cdist(test, train, metric=metric)
    # Find the k nearest neighbours of each sample as an m x k matrix of indices.
    # A stable sort keeps the lower training index first when distances are equal.
    nearest_neighbours = np.argsort(pairwise_distance, axis=1, kind="stable")[:, :k]
    # Look up the classes corresponding to each index.
    nearest_labels = training_labels[nearest_neighbours]

    # Return the most frequent class on each row: count the votes per (row, class)
    # and take the arg-max. np.unique sorts the classes, so ties go to the smallest label.
    classes, inverse = np.unique(nearest_labels, return_inverse=True)
    # Depending on the NumPy version the inverse is flat or input-shaped; normalise it.
    inverse = inverse.reshape(nearest_labels.shape)
    votes = np.zeros((num_test, classes.size), dtype=np.intp)
    np.add.at(votes, (np.arange(num_test)[:, np.newaxis], inverse), 1)
    return classes[votes.argmax(axis=1)]
In [11]:
# Your code goes here
In [14]:
# Your code goes here
In [16]:
# Your code goes here
In [18]:
# Your code goes here
In [ ]:
# Your code goes here
In [21]:
# Your code goes here
In [5]:
# Make sure you have scikit-learn installed.
from sklearn.feature_extraction.text import CountVectorizer
# Put each new article on a single line by replacing its newlines with spaces.
# NOTE(review): sp0..sp4 are not defined in the cells visible here; presumably they are
# article strings created in an earlier cell. Confirm before running.
articles = []
for f in [sp0, sp1, sp2, sp3, sp4]:
    text = f.replace('\n', ' ')
    articles.append(text)
# Count words against the existing vocabulary rather than learning a new one, so that the
# columns of new_data are intended to line up with the columns of data.
vrizer = CountVectorizer(vocabulary=vocabulary)
new_data = vrizer.fit_transform(articles)
In [ ]:
# Your code goes here
In [29]:
# Stack the rows of new_data beneath the original matrix; both need the same number of columns.
data_augmented = scipy.sparse.vstack((data, new_data))
# The augmented label vector needs one entry per row of data_augmented.
labels_augmented = ... # your code goes here
In [8]:
# You may write your calculations in LaTeX or in code here
In [34]:
# run this cell first
def Get_p_value(zp):
    """
    Return the probability that a standard normal variable lies within +/- |zp|.

    zp: The z value; its sign is ignored.
    Returns: 1 minus twice the upper-tail probability beyond |zp|, rounded to two decimals.
    """
    # norm.sf is the survival function, i.e. the area of one tail beyond |zp|.
    upper_tail = scipy.stats.norm.sf(abs(zp))
    return round(1 - upper_tail * 2, 2)
In [35]:
# Use this cell to compare the output of the function Get_p_value with
# the table provided in your lecture notes (e.g., Slide 12, Chapter3C.pdf)
print('zp = 0.67, p = ', Get_p_value(0.67))
print('zp = 1, p = ', Get_p_value(1))
print('zp = 1.64, p = ', Get_p_value(1.64))
print('zp = 2.58, p = ', Get_p_value(2.58))
print()
# You can alter the input zp value and re-run this cell to calculate the corresponding p.
print('p = ', Get_p_value(0.43))
# You can change 0.43 to any zp value you obtained.