In [3]:

```
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
# Load the dataset and unpack its four parts: the sparse document/word count
# matrix, the per-document class labels, the class names and the vocabulary.
# NOTE: allow_pickle=True lets np.load unpickle Python objects stored in the
# file; unpickling can execute arbitrary code, so only load trusted files.
data, labels, class_names, vocabulary = np.load(
    "ReutersNews_4Classes_sparse.npy",
    allow_pickle=True,
)
```

In [2]:

```
# Printing a sparse row vector lists only its non-zero indices and their values.
print(data[41,:])
# Convert back to a dense NumPy array. Note that the result is a (1, 6428) matrix, not a vector.
print(data[41,:].toarray())
# vocabulary cannot be indexed with a sparse matrix (e.g. vocabulary[data[41,:] > 0]),
# so the positions of the non-zero entries are looked up explicitly instead.
rows, columns, values = scipy.sparse.find(data[41,:]) # Non-zero entries of the 42nd document (row index 41).
print(vocabulary[columns]) # Prints the words present in the 42nd document.
```

In [3]:

`print(", ".join(vocabulary))`

In [4]:

```
# Choose a document (row) index and a word (column) index,
# then print the value stored for that pair.
i = 40
j = 2
print(data[i, j])
```

In [5]:

```
# Label of document i; i is the row index assigned in the preceding cell.
print(labels[i])
```

In [6]:

```
# Describe one matrix entry: the value stored for row 0 / column 10,
# the class name of row 0, and the vocabulary word for column 10.
print("Occurrences:", data[0,10])
print("Class:", class_names[labels[0]])
print("Word:", vocabulary[10])
```

In [7]:

```
def sample_indices(labels, *num_per_class):
    """
    Returns randomly selected indices. It will return the specified number of indices for each class.

    labels: A vector (array or list) where labels[i] is the integer class of sample i.
    num_per_class: One count per class, in class order: the first value is the number
        of indices to draw from class 0, the second from class 1, and so on.
        Indices are drawn without replacement using NumPy's global random state.
    Returns: A 1-D integer array of indices, grouped by class in ascending class order.
        The array is empty, but still integer-typed, when no counts are given.
    Raises: ValueError (from np.random.choice) if more indices are requested for a
        class than that class has samples.
    """
    # Accept plain lists too; comparing a list with == would not be element-wise.
    labels = np.asarray(labels)
    chosen = []
    for cls, num in enumerate(num_per_class):
        cls_indices = np.where(labels == cls)[0]
        chosen.extend(np.random.choice(cls_indices, size=num, replace=False))
    # An explicit integer dtype keeps the result usable as an index even when it is
    # empty; np.array([]) on its own defaults to float64, which cannot index arrays.
    return np.array(chosen, dtype=np.intp)
```

In [8]:

```
# Request 1, 2, 3 and 4 random indices from classes 0, 1, 2 and 3 respectively.
indices = sample_indices(labels, 1, 2, 3, 4)
print("Returned indices:", indices)
# Indexing with the returned array selects the matching rows of data and entries of labels.
print("Samples:", data[indices])
print("Corresponding classes:", labels[indices])
```

In [9]:

```
import scipy.stats
def knn_classify(test_samples, training_data, training_labels, metric="euclidean", k=1):
    """
    Performs k-nearest neighbour classification on the provided samples,
    given training data and the corresponding labels.

    test_samples: An m x d matrix of m samples to classify, each with d features.
        May be a dense array or a SciPy sparse matrix.
    training_data: An n x d matrix consisting of n training samples, each with d features.
        May be a dense array or a SciPy sparse matrix.
    training_labels: A vector of size n, where training_labels[i] is the label of training_data[i].
    metric: The metric to use for calculating distances between samples; any
        metric name or callable accepted by scipy.spatial.distance.cdist.
    k: The number of nearest neighbours to use for classification (1 <= k <= n).
    Returns: A vector of size m, where out[i] is the predicted class of test_samples[i].
        When several classes are equally frequent among the k neighbours,
        the smallest label wins.
    Raises: ValueError if training_labels does not have n entries or k is outside 1..n.
    """
    # Imported locally because this module is not imported at the top of the notebook.
    from scipy.spatial.distance import cdist

    def _to_dense(matrix):
        # cdist only accepts dense input, so sparse matrices are expanded first.
        if scipy.sparse.issparse(matrix):
            return matrix.toarray()
        return np.atleast_2d(np.asarray(matrix))

    test_dense = _to_dense(test_samples)
    training_dense = _to_dense(training_data)
    training_labels = np.asarray(training_labels).ravel()

    num_training = training_dense.shape[0]
    if training_labels.shape[0] != num_training:
        raise ValueError(
            "training_labels has %d entries but training_data has %d rows"
            % (training_labels.shape[0], num_training)
        )
    if not 1 <= k <= num_training:
        raise ValueError(
            "k must be between 1 and the number of training samples (%d), got %r"
            % (num_training, k)
        )

    # Calculate an m x n distance matrix.
    pairwise_distance = cdist(test_dense, training_dense, metric=metric)
    # Find the k nearest neighbours of each sample as an m x k matrix of indices.
    # A stable sort keeps ties deterministic: the lower training index comes first.
    nearest_neighbours = np.argsort(pairwise_distance, axis=1, kind="stable")[:, :k]
    # Look up the classes corresponding to each index.
    nearest_labels = training_labels[nearest_neighbours]
    # Return the most frequent class on each row, as a flat vector of size m.
    predictions = np.empty(nearest_labels.shape[0], dtype=training_labels.dtype)
    for row, row_labels in enumerate(nearest_labels):
        classes, counts = np.unique(row_labels, return_counts=True)
        # np.unique returns the classes sorted and argmax picks the first maximum,
        # so a tie is resolved in favour of the smallest label.
        predictions[row] = classes[np.argmax(counts)]
    return predictions
```

In [11]:

```
# Your code goes here
```

In [14]:

```
# Your code goes here
```

In [16]:

```
# Your code goes here
```

In [18]:

```
# Your code goes here
```

In [ ]:

```
# Your code goes here
```

In [21]:

```
# Your code goes here
```

In [5]:

```
# Make sure you have scikit-learn installed.
from sklearn.feature_extraction.text import CountVectorizer

# Put each article on a single line by replacing newlines with spaces.
# sp0 ... sp4 must already be defined as strings before this cell is run.
articles = [f.replace('\n', ' ') for f in [sp0, sp1, sp2, sp3, sp4]]

# Count words against the existing vocabulary rather than learning a new one.
vrizer = CountVectorizer(vocabulary=vocabulary)
new_data = vrizer.fit_transform(articles)
```

In [ ]:

```
# Your code goes here
```

In [29]:

```
# Stack the rows of new_data underneath the rows of data in one sparse matrix.
data_augmented = scipy.sparse.vstack((data, new_data))
# Placeholder to be completed: the ellipsis must be replaced with the labels for
# data_augmented (presumably labels followed by one label per new row - confirm).
labels_augmented = ... # your code goes here
```

In [8]:

```
# You may write your calculations in LateX or in code here
```

In [34]:

```
# run this cell first
def Get_p_value(zp):
    """
    Returns the probability that a standard normal variable lies within
    zp standard deviations of the mean, i.e. P(-|zp| <= Z <= |zp|),
    rounded to two decimal places.

    Despite the name, this is the central (two-sided) probability rather than
    a tail p-value: for example zp = 1.64 gives 0.9 and zp = 2.58 gives 0.99.

    zp: A scalar z value; its sign is ignored.
    Returns: The central probability as a float rounded to 2 decimals.
    """
    # Imported here so that this cell works on its own, without relying on
    # another cell having imported scipy.stats first.
    from scipy.stats import norm

    # sf(|zp|) is the upper-tail probability; doubling it covers both tails and
    # subtracting from 1 leaves the area between -|zp| and +|zp|.
    return round(1 - norm.sf(abs(zp)) * 2, 2)
```

In [35]:

```
# Use this cell to compare the output of the function Get_p_value with
# the table provided in your lecture notes (e.g., Slide 12, Chapter3C.pdf).
print('zp = 0.67, p = ', Get_p_value(0.67))
print('zp = 1, p = ', Get_p_value(1))
print('zp = 1.64, p = ', Get_p_value(1.64))
print('zp = 2.58, p = ', Get_p_value(2.58))
print()
# You can alter the input zp value and re-run this cell to calculate the corresponding p.
print('p = ', Get_p_value(0.43))
# You can change 0.43 to any zp value you obtained.
```