CS37300 - Data Mining and Machine Learning¶

Instructor: Bruno Ribeiro¶

Notes: Bruno Ribeiro¶


Logistic Regression & Perceptron¶

Learning objectives¶

  • Introduce the perceptron
    • Knowledge representation
    • Model space
    • Score
    • Search
  • Introduce the logistic regression classifier
    • Knowledge representation
    • Model space
    • Score
    • Search
      • Using the chain rule for computing derivatives
  • Contrast logistic regression against linear regression for classification
  • Introduce multiclass classification using probabilistic models
  • Find the logistic regression decision boundary
  • Introduction to Pytorch

1. Classification¶

In classification problems, each entity is placed in one of a discrete set of categories: yes/no, dog/cat, good/bad/indifferent, red/green/blue, ...

  • Given a training set of labeled entities, develop a rule for assigning labels to entities in a test set

Many variations on this theme:

  • binary classification
  • multi-category classification
  • overlapping categories
  • ranking

Many criteria to assess rules and their predictions

  • overall errors
  • costs associated with different kinds of errors

ack: Michael Jordan (UCB)

Each object to be classified is represented as a pair $({\bf x}, y)$:

  • where ${\bf x}$ is a description of the object
  • where y is a label

Success or failure of a machine learning classifier often depends on choosing good descriptions of objects. The choice of description can itself be viewed as a learning problem, and indeed we'll discuss automated procedures for choosing descriptions in a later lecture.

  • Good human intuitions often needed
  • For now we will assume ${\bf x} \in \mathbb{R}^d$ comes as a $d$-dimensional vector

ack: Michael Jordan (UCB)

Classification Example¶

  • Given ${\bf x}$ features of a borrower (income, loan request, FICO score,…)
  • Classify whether a borrower will repay a loan based on their features ${\bf x}$

2. Quiz¶

With respect to ${\bf w}_1$, is the linear regression score $$\frac{1}{n} \sum_{i=1}^{n} \big(y_i - x_{i,1} {\bf w}_1 - x_{i,2} {\bf w}_2 - b \big)^2$$

  • (a) Concave
  • (b) Convex
  • (c) Neither
  1. Hint 1: Collapse all terms that do not involve ${\bf w}_1$ into a single constant $c_i$: $\frac{1}{n} \sum_{i=1}^{n} \big(c_i - x_{i,1} {\bf w}_1 \big)^2$.
  2. Hint 2: Compute the second derivative of the above score with respect to ${\bf w}_1$.

3. Perceptron algorithm¶

[Perceptron algorithm figures (images not included in this text export).]
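Since the perceptron material above is only in the figures, here is a minimal NumPy sketch of the perceptron learning rule for reference. It assumes labels $y \in \{-1,+1\}$ and folds the bias into the weight vector; the function name perceptron_train and the toy data are illustrative, not taken from the figures.

import numpy as np

def perceptron_train(X, y, n_epochs=20):
    """Minimal perceptron: X is (n, d), labels y are in {-1, +1}."""
    n, d = X.shape
    Xb = np.hstack([X, np.ones((n, 1))])   # append a constant 1 to fold in the bias
    w = np.zeros(d + 1)
    for _ in range(n_epochs):
        for xi, yi in zip(Xb, y):
            if yi * (w @ xi) <= 0:          # misclassified (or exactly on the boundary)
                w += yi * xi                # perceptron update: w <- w + y x
    return w

# Tiny linearly separable example
X_toy = np.array([[2., 2.], [1., 3.], [-1., -2.], [-2., -1.]])
y_toy = np.array([1, 1, -1, -1])
w_hat = perceptron_train(X_toy, y_toy)
print(np.sign(np.hstack([X_toy, np.ones((4, 1))]) @ w_hat))  # should match y_toy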

4. What about the Least Squares Regression for Classification?¶

Linear Regression for Classification¶

[Figures illustrating least squares regression used for classification (images not included in this text export).]

img credit: Bishop's book
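To make the issue concrete, here is a small sketch (my own example, not from the slides): we fit ordinary least squares to the 0/1 labels of a two-moons dataset and classify by thresholding the fitted value at 0.5. Because squared error also penalizes points that are far on the correct side of the boundary, such points can drag the least-squares boundary around, which logistic regression avoids.

import numpy as np
from sklearn.datasets import make_moons

X_ls, y_ls = make_moons(n_samples=500, random_state=0, noise=0.1)
Xb = np.hstack([X_ls, np.ones((X_ls.shape[0], 1))])   # append a constant-1 column for the bias

# Ordinary least squares fit of the 0/1 labels
w_ls, *_ = np.linalg.lstsq(Xb, y_ls, rcond=None)

# "Classify" by thresholding the real-valued prediction at 0.5
y_hat_ls = (Xb @ w_ls > 0.5).astype(int)
print("Least-squares classification accuracy:", (y_hat_ls == y_ls).mean())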

5. Logistic Regression: Solving the Linear Regression issue for classification¶

5.1. Logistic Regression for Supervised Learning (Classification)¶

Consider the following task. We want to estimate the probability that someone is rich. The input data is $\{(x_{i,1},x_{i,2}),y_i\}_i$ where input $x_{\cdot,1} \in \{0,1\}$ indicates yacht ownership, $x_{\cdot,2} \in \{0,1\}$ indicates mansion ownership, and $y_{\cdot} \in \{0,1\}$ indicates whether the person is rich. A logistic regression model can be represented as:

No description has been provided for this image

The s-shaped symbol after $\sum$ represents the sigmoid function.

5.2. Logistic Regression (LR) (for classification)¶

  • Assume label $Y$ has two classes, $0$ and $1$
  • Similar to predicting the probability of a Bernoulli "biased coin"
    • Given $x$, what is the probability $P(Y=1|X=x)$?

5.3. Data Representation¶

Given an object's attribute vector ${\bf x} = (x_1, \ldots , x_p)$ with $p$ real-valued features, predict the object's class $y \in \{0,1\}$.

  • Training data: $\{({\bf x}(1),y(1)),\ldots,({\bf x}(n),y(n))\}$
  • Example: Let $p=2$ in the following example.
In [1]:
### Data Example
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import pylab
import numpy as np

X, y = make_moons(n_samples=5000, random_state=42, noise=0.1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,test_size=0.3)

pylab.scatter(X[:,0], X[:,1], c=y)
pylab.xlabel("$x_1$",size=22)
pylab.ylabel("$x_2$",size=22)
pylab.show()
No description has been provided for this image

5.4. Knowledge Representation¶

We will consider a Logistic Regression model.

  • Logistic regression model: $$ f({\bf x}) = {\bf w}^T {\bf x} + b = \sum_{j=1}^p w_j x_j + b,$$ with $$ P(y = 1 | {\bf x}; {\bf w}) = \sigma(f({\bf x})), $$ where $\sigma(a) = \frac{\exp(a)}{1+\exp(a)}$ is the sigmoid function.
  • Alternative model
    • Let ${\bf x}' = (1,{\bf x})$
    • Let ${\bf w} = (b,w_1,\ldots,w_p)$.
    • Logistic regression model: $$ P(Y=1| X={\bf x}) = \sigma( f({\bf x'})) = \sigma({\bf w}^T {\bf x}'), $$ with $\sigma$ again as the sigmoid function $$ \sigma(a) = \frac{\exp(a)}{1+\exp(a)}. $$
In [2]:
### Plot sigmoid function

import numpy as np
import matplotlib.pyplot as plt
 
# Sigmoid function
#
def sigmoid(a):
    return 1 / (1 + np.exp(-a))
# Creating sample f points
#
f = np.arange(-5, 5, 0.1)
 
# Invoking Sigmoid function on all f points
#
sigma_f = sigmoid(f)
 
# Plotting the Sigmoid function
#
plt.plot(f, sigma_f)
plt.axvline(0.0, color='k')
plt.xlabel('$f(x)$',size=24)
plt.ylabel(r'$\sigma(f(x))$',size=24)
plt.yticks([0.0, 0.5, 1.0],size=24)
plt.xticks([-4, -2, 0 , 2, 4],size=24)
ax = plt.gca()
ax.yaxis.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

5.5. Decision boundary between classes (we will revisit this later)¶

  • The logistic uses the inner product between ${\bf w}$ and ${\bf x}$: ${\bf w}^T {\bf x}$ is proportional to the length of the projection of ${\bf x}$ onto ${\bf w}$.
  • If the inner product is 0, then ${\bf x}$ is perpendicular to ${\bf w}$
    • Note that $\sigma(0) = 0.5$, which means $P(Y=1| X={\bf x}) = \sigma(0) = 0.5$
    • Decision boundary is therefore linear (and divides the space into the two classes). The decision boundary is:
      • a point if $p=1$
      • a line if $p=2$
      • a plane if $p = 3$
      • a hyperplane if $p \geq 4$
  • Now add b,
    • if $f({\bf x}) > 0$, then $\sigma(f({\bf x})) > 0.5$
    • if $f({\bf x}) < 0$, then $\sigma(f({\bf x})) < 0.5$
In [3]:
import numpy as np
import pylab

# We initialize weights and bias here with arbitrary values for a decision boundary.
w = np.array([1, -1])  # Weight vector w
b = 0.5  # Bias term
x1_vals = np.linspace(-1.5, 2.5, 100)  # x1 values range from -1.5 to 2.5
x2_vals = -(w[0] * x1_vals + b) / w[1]  # Compute x2 values using the decision boundary equation

# Scatter plot of the data points with class labels.
pylab.scatter(X[:, 0], X[:, 1], c=y)  # Plot data points with color indicating class (y)

# Plot the decision boundary line (x2 vs x1 based on the linear equation).
pylab.plot(x1_vals, x2_vals, label="Decision Boundary", color='red')

# Add the weight vector (w) as an arrow starting from the origin (0, 0).
origin = np.array([0, 0])  # Define origin at (0, 0)
pylab.quiver(*origin, *w, scale=5, color='purple')  # Draw an arrow representing vector w

# Add a text label "w" near the arrow to identify the weight vector.
pylab.text(w[0] - 0.5, w[1], 'w', color='purple', fontsize=15, ha='center', va='center')  # Label the vector

# Add vertical and horizontal black lines to mark the axes (thin lines).
pylab.axhline(y=0, color='black', linewidth=0.5)  # Draw horizontal x-axis at y=0
pylab.axvline(x=0, color='black', linewidth=0.5)  # Draw vertical y-axis at x=0

# Set y-axis limit so that it extends to -1.5.
pylab.ylim(-1.5, None)  # Set the lower limit of the y-axis to -1.5, leaving the upper limit unchanged

# Define and plot specific points relative to the decision boundary.
point_on_boundary = np.array([0, -b / w[1]])  # A point exactly on the decision boundary
point_near_boundary = np.array([0, (-b / w[1]) + 0.5])  # A point slightly above the decision boundary
point_far_from_boundary = np.array([1, -1])  # A point farther away from the decision boundary

# Scatter plot these specific points with distinct colors for visualization.
pylab.scatter(point_on_boundary[0], point_on_boundary[1], color='green', label='On boundary', s=100)
pylab.scatter(point_near_boundary[0], point_near_boundary[1], color='lightblue', label='Near boundary', s=100)
pylab.scatter(point_far_from_boundary[0], point_far_from_boundary[1], color='orange', label='Far from boundary', s=100)

# Set labels for x and y axes.
pylab.xlabel("$x_1$", size=22)  # Label for x-axis (x1)
pylab.ylabel("$x_2$", size=22)  # Label for y-axis (x2)

# Add a legend to explain the plotted points and decision boundary.
pylab.legend()

# Display the plot.
pylab.show()

# Define the sigmoid function for probability calculation based on raw output z.
def sigmoid(z):
    return 1 / (1 + np.exp(-z))  # Sigmoid activation function

# Loop through each specific point to calculate its raw model output (z) and the probability of being in class 1.
for point in [point_on_boundary, point_near_boundary, point_far_from_boundary]:
    z = np.dot(w, point) + b  # Calculate the linear combination of inputs and weights (z = w · x + b)
    prob = sigmoid(z)  # Apply sigmoid to get the probability
    print(f"Point {point}:")
    print(f"    z = {z:.4f} (this is the raw model output before applying sigmoid)")
    print(f"    Probability of being in class 1: {prob:.4f}\n")  # Display the probability for class 1
No description has been provided for this image
Point [0.  0.5]:
    z = 0.0000 (this is the raw model output before applying sigmoid)
    Probability of being in class 1: 0.5000

Point [0. 1.]:
    z = -0.5000 (this is the raw model output before applying sigmoid)
    Probability of being in class 1: 0.3775

Point [ 1 -1]:
    z = 2.5000 (this is the raw model output before applying sigmoid)
    Probability of being in class 1: 0.9241

5.6. Logistic Regression Graphical Depiction¶

No description has been provided for this image

6. Hands-on Example¶

6.1. Data & Knowledge representation for two moons example¶

The training data:

  • Training data: $\{({\bf x}(1),y(1)),\ldots,({\bf x}(n),y(n))\}$
  • ${\bf x} = (x_{1},x_{2}) \in \mathbb{R}^2$ is the 2-dimensional input
  • ${\bf w} = (w_1,w_2) \in \mathbb{R}^2$ are the logistic regression weights
  • $b$ is the bias
  • Note that in the code we will append an extra dimension to ${\bf x}$ with constant value 1.
    • This is because it is often easier to fold the bias into the weights by defining $b = w_0$ (a quick numerical check of this trick follows below).
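As a quick check of the bias-folding trick (a minimal sketch; the specific numbers are only illustrative):

import numpy as np

w = np.array([0.3, -0.7])   # illustrative weights
b = 0.25                    # illustrative bias
x = np.array([1.5, 2.0])    # illustrative input

x_prime = np.append(1.0, x)   # x' = (1, x_1, x_2)
w_prime = np.append(b, w)     # w' = (b, w_1, w_2)

print(w @ x + b)              # w^T x + b
print(w_prime @ x_prime)      # w'^T x' gives the same value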

6.2. Model space¶

  • The model space of logistic regression is ${\bf w} \in \mathbb{R}^p$ and $b \in \mathbb{R}$.
    • For the two moons example, $p=2$.

6.3. (Recap) Likelihood of Bernoulli Trials¶

Before we calculate the score function of logistic regression, recall that if $Y(1), \ldots, Y(n)$ are independent Bernoulli trials with parameters $p_1, \ldots, p_n$, then the probability for each trial is defined as:

$$ P(Y(i) = 1 | p_i) = p_i \quad \text{and} \quad P(Y(i) = 0 | p_i) = 1 - p_i. $$

Since the trials are independent, the joint probability (likelihood) of observing outcomes $Y(1) = y(1), \ldots, Y(n) = y(n)$ is:

$$ P(Y(1) = y(1), \ldots, Y(n) = y(n) | p_1, \ldots, p_n) = \prod_{i=1}^n p_i^{y(i)} (1 - p_i)^{1 - y(i)}. $$

6.4. Negative Log-Likelihood of Observations¶

To simplify computations, we often work with the log-likelihood, which is the logarithm of the joint probability:

$$ \log P(Y(1) = y(1), \ldots, Y(n) = y(n) | p_1, \ldots, p_n) = \log \prod_{i=1}^n p_i^{y(i)} (1 - p_i)^{1 - y(i)}. $$

Using the property of logarithms that $\log(ab) = \log(a) + \log(b)$, we can split the log of a product into a sum of logs:

$$ = \sum_{i=1}^n \log \left( p_i^{y(i)} (1 - p_i)^{1 - y(i)} \right). $$

Next, apply the logarithm rule $\log(a^b) = b \log(a)$ to both terms inside the sum:

$$ = \sum_{i=1}^n \left( y(i) \log p_i + (1 - y(i)) \log (1 - p_i) \right). $$

Thus, the negative log-likelihood (NLL), which is also our score function, is:

$$ -\log P(Y(1) = y(1), \ldots, Y(n) = y(n) | p_1, \ldots, p_n) = - \sum_{i=1}^n \left( y(i) \log p_i + (1 - y(i)) \log (1 - p_i) \right). $$

6.5. Applying the Logistic Model¶

In logistic regression, the probability $p_i$ is modeled as:

$$ p_i = \sigma(\mathbf{w}^T \mathbf{x}(i) + b), $$

where $\sigma(z) = \frac{1}{1 + e^{-z}}$ is the sigmoid function. Substituting $p_i$ into the negative log-likelihood, the score function becomes:

$$ - \sum_{i=1}^n \left( y(i) \log \sigma(\mathbf{w}^T \mathbf{x}(i) + b) + (1 - y(i)) \log (1 - \sigma(\mathbf{w}^T \mathbf{x}(i) + b)) \right). $$

This is the final score function for logistic regression.
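This score function translates directly into code. Below is a minimal NumPy sketch (the function name nll_score and the small clipping constant eps are my additions, used only to avoid taking log(0)):

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def nll_score(w, b, X, y, eps=1e-12):
    """Negative log-likelihood (score) of logistic regression.
    X: (n, p) inputs, y: (n,) labels in {0, 1}."""
    p = sigmoid(X @ w + b)
    p = np.clip(p, eps, 1 - eps)   # avoid log(0)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

# Illustrative usage on tiny synthetic data
X_toy = np.array([[0.5, 1.0], [-1.0, 0.2], [2.0, -0.5]])
y_toy = np.array([1, 0, 1])
print(nll_score(np.array([0.1, -0.2]), 0.0, X_toy, y_toy))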

6.6. Model search¶

  • To find the best model, we minimize the score (negative log-likelihood) over parameters ${\bf w}$ and $b$
  • The above score is also known as the cross-entropy loss, logistic loss, or log loss in TensorFlow, scikit-learn, and PyTorch.

Gradient descent to find optimal values ${\bf w}^\star, b^\star$: $$\begin{align*} {\bf w}^\star, b^\star &={\arg\!\min}_{{\bf w},b} - \frac{1}{n} \sum_{i=1}^n \log P(Y = y(i) | X={\bf x}(i); {\bf w},b ) \\ &= {\arg\!\min}_{{\bf w},b} - \frac{1}{n} \sum_{i=1}^n y(i) \log \sigma({\bf w}^T {\bf x}(i) + b) + (1-y(i)) \log (1-\sigma({\bf w}^T {\bf x}(i) + b))  , \end{align*} $$ where $\sigma(x) = \frac{\exp(x)}{1 + \exp(x)}$.

6.7. Logistic Regression Convexity¶

  • The negative log-likelihood of the logistic regression is convex in ${\bf w}$.
  • That is, the NLL has a “bowl-shaped” graph.
  • This means gradient descent always moves in a direction that decreases the loss and, with a suitable learning rate, converges to the global minimum, where the gradient is zero.

Start of code example¶

In [4]:
# There are only two features in the data X[:,0] and X[:,1]
n_features = 2
# There are only two classes: 0 (purple points) and 1 (yellow points)
n_class = 2

def init_weights(n_features):
    # Initialize weights with Standard Normal random variables
    model = dict(
        w=np.random.randn(n_features + 1, 1),
    )
    return model

model = init_weights(n_features=n_features)

print(model)
{'w': array([[-0.00129096],
       [ 1.04820381],
       [-0.26972077]])}

6.8. Model Search: Minimizing score function by gradient descent¶

Obtaining ${\bf w}$ via gradient descent:

  • What are the derivatives?
    • $\frac{\partial}{\partial w_1} \log \sigma({\bf w}^T {\bf x}(i) + b)$
    • $\frac{\partial}{\partial w_2} \log \sigma({\bf w}^T {\bf x}(i) + b)$
    • $\frac{\partial}{\partial b} \log \sigma({\bf w}^T {\bf x}(i) + b)$
    • $\frac{\partial}{\partial w_1} \log (1-\sigma({\bf w}^T {\bf x}(i) + b))$
    • $\frac{\partial}{\partial w_2} \log (1-\sigma({\bf w}^T {\bf x}(i) + b))$
    • $\frac{\partial}{\partial b} \log (1-\sigma({\bf w}^T {\bf x}(i) + b))$

6.8.1. Gradient Facts¶

  • Fact 1: $\frac{\partial}{\partial a} \log(a) = \frac{1}{a}$.
  • Fact 2: For any function $g$, $\frac {\partial}{\partial a}\log g(a)=\frac {1}{g(a)}{\frac {\partial g(a)}{\partial a}}$
  • Fact 3: $\frac{\partial}{\partial a} \sigma(a) = \sigma(a)(1-\sigma(a))$, where $\sigma$ is the sigmoid function (a quick numerical check of this fact is sketched right after this list).
  • Fact 4 (chain rule): For continuously differentiable functions $f$ and $g$, $\frac{\partial}{\partial a} f(g(a)) =\left(\frac{\partial}{\partial b}f(b)\big|_{b = g(a)}\right)\frac{\partial}{\partial a}g(a)$
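A quick numerical sanity check of Fact 3 (a sketch, using a central finite-difference approximation of the derivative; the point $a = 0.7$ is arbitrary):

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

a = 0.7                   # an arbitrary evaluation point
h = 1e-6                  # finite-difference step
numeric = (sigmoid(a + h) - sigmoid(a - h)) / (2 * h)   # central-difference estimate of the derivative
analytic = sigmoid(a) * (1 - sigmoid(a))                # Fact 3
print(numeric, analytic)  # the two values agree to about 6 decimal places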

6.8.2. Finding the gradients¶

  • Combining all the above facts we obtain our first derivative:

$$\begin{align*} & \frac{\partial}{\partial w_1} y(i) \log \sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b)\\ & = y(i) \frac{1}{\sigma({\bf w}^T {\bf x}(i) + b)} \frac{\partial}{\partial w_1} \sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) \tag{from fact 2} \\ &= y(i) \frac{1}{\sigma({\bf w}^T {\bf x}(i) + b)} \big(\frac{\partial}{\partial a} \sigma(a)\big|_{a = {\bf w}^T {\bf x}(i) + b}\big) {\bf x}_1(i) \tag{from fact 4} \\ &= y(i) \frac{1}{\sigma({\bf w}^T {\bf x}(i) + b)} \sigma({\bf w}^T {\bf x}(i) + b) (1-\sigma({\bf w}^T {\bf x}(i) + b)) {\bf x}_1(i) \tag{from fact 3} \\ &= y(i) (1-\sigma({\bf w}^T {\bf x}(i) + b)) {\bf x}_1(i) \end{align*}$$

We can follow similar steps to obtain all other derivatives:

  • $\frac{\partial}{\partial w_2} y(i) \log \sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) = y(i) (1-\sigma({\bf w}^T {\bf x}(i) + b)) {\bf x}_2(i)$
  • $\frac{\partial}{\partial b} y(i) \log \sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) = y(i) (1-\sigma({\bf w}^T {\bf x}(i) + b))$
  • $\frac{\partial}{\partial w_1} (1- y(i)) \log (1-\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b)) = -(1-y(i)) \sigma({\bf w}^T {\bf x}(i) + b) {\bf x}_1(i)$
  • $\frac{\partial}{\partial w_2} (1- y(i)) \log (1-\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b)) = -(1-y(i)) \sigma({\bf w}^T {\bf x}(i) + b) {\bf x}_2(i)$
  • $\frac{\partial}{\partial b} (1- y(i)) \log (1-\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b)) = -(1-y(i)) \sigma({\bf w}^T {\bf x}(i) + b)$

Note that the Score is the negative log-likelihood, so its derivative flips the sign of the sum of the two derivatives above:

  • $\frac{\partial \text{Score}(i)}{\partial w_1} = -\Big(\frac{\partial}{\partial w_1} y(i) \log \sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) +\frac{\partial}{\partial w_1} (1- y(i)) \log (1-\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b))\Big) = (\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) - y(i)) {\bf x}_1(i)$
    • We will call $(\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) - y(i))$ the error.
In [5]:
# Defines sigmoid function
def sigmoid(a):
    return (1 / (1 + np.exp(-a)))

# For a single example $x$
def logistic_regression(x, model):
    x = np.append(x, 1)
    
    # Input times weight vector
    f_x = x @ model['w']

    # Sigmoid activation
    y_hat = sigmoid(f_x)

    return y_hat
In [6]:
def logistic_derivative(model, xs, errors):
    """xs, errs contain all information (input, error) of all the data"""
    # gradient of the negative log-likelihood with respect to model parameters
    dw = xs.T @ errors

    return dict(w=dw)

One-hot encoding: Encoding of the target variable $y$ for $K$ classes ($K=2$ in our case)

No description has been provided for this image
In [7]:
# Get gradient of all parameters

def get_gradient(model, X_train, y_train):
    xs, errors = [], []

    for x, cls_idx in zip(X_train, y_train):
        y_pred = logistic_regression(x, model)

        # Create one-hot coding of true label
        y_true = np.zeros(n_class)
        y_true[int(cls_idx)] = 1.

        error = y_pred - y_true[1]

        # Accumulate the information (input, error) of each example
        # x: all inputs
        # error: all errors
        x = np.append(x, 1)
        xs.append(x)
        errors.append(error)

    # Get gradients
    return logistic_derivative(model, np.array(xs), np.array(errors))

6.8.3. Define one gradient descent step¶

We now perform a single gradient descent step.

  • Update ${\bf w}_1$: Get the gradients and perform the following updates over the $n$ training examples: $$ {\bf w}_1 ={\bf w}_1 - \epsilon \sum_{i=1}^n \frac{\partial \text{Score}(i)}{\partial {\bf w}_1}, $$ where $\frac{\partial \text{Score}(i)}{\partial {\bf w}_1} = (\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) - y(i)) {\bf x}_1(i)$ and $\epsilon$ is our learning rate.

  • Update ${\bf w}_2$: $${\bf w}_2 ={\bf w}_2 - \epsilon \sum_{i=1}^n \frac{\partial \text{Score}(i)}{\partial {\bf w}_2},$$ where $\frac{\partial \text{Score}(i)}{\partial {\bf w}_2} = (\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) - y(i)) {\bf x}_2(i)$.

  • Update $b$:$$b =b - \epsilon \sum_{i=1}^n \frac{\partial \text{Score}(i)}{\partial b},$$ where $\frac{\partial \text{Score}(i)}{\partial b} = (\sigma(w_1 {\bf x}_1(i) + w_2 {\bf x}_2(i) + b) - y(i))$

In [8]:
def one_gradient_step(model, X_train, y_train, learning_rate=1e-2):
    grad = get_gradient(model, X_train, y_train)

    # Update all parameters in the parameter dictionary
    for param in grad:
        # Learning rate: default 1e-2
        model[param] -= learning_rate * grad[param]

    return model
In [9]:
def gradient_descent(model, X_train, y_train, no_iter=10):
    for epoch in range(no_iter):
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}')

        model = one_gradient_step(model, X_train, y_train)

    print(f'Epoch {epoch}')
    return model
In [10]:
no_iter = 100

# Reset model
model = init_weights(n_features)

# Search for best parameters to minimize score function
model = gradient_descent(model, X_train, y_train, no_iter=no_iter)
Epoch 0
Epoch 10
Epoch 20
Epoch 30
Epoch 40
Epoch 50
Epoch 60
Epoch 70
Epoch 80
Epoch 90
Epoch 99

6.9. Test learned model¶

Now we will use a separate held-out dataset (the test data) to measure the accuracy of our model.

In [11]:
y_pred = np.zeros_like(y_test)

accuracy = 0

for i, x in enumerate(X_test):
    # Predict the distribution of label
    y_hat = logistic_regression(x, model)
    # Get label by picking the most probable class
    y = (y_hat[0] > 0.5).astype(int)
    y_pred[i] = y

# Accuracy of predictions with the true labels and take the percentage
# Because our dataset is balanced, measuring just the accuracy is OK
accuracy = (y_pred == y_test).sum() / y_test.size
print('Test accuracy after {} gradient steps: {}'.format(no_iter,accuracy))

pylab.scatter(X_test[:,0], X_test[:,1], c=y_pred)
pylab.show()
Test accuracy after 100 gradient steps: 0.8553333333333333
No description has been provided for this image

6.10. [IMPORTANT] Always add L1 or L2 regularization to logistic regression in case the training data is linearly separable¶

No description has been provided for this image
  • For linearly separable data, as in the left figure of the image above, the classifier will try to push $\sigma({\bf w}^T {\bf x}(i) + b)$ to be exactly zero or one for every example.
    • This means gradient descent keeps trying to increase $\Vert {\bf w} \Vert_1$, since doing so can always decrease the loss (score) a little further.
  • Hence, it is important to add L1 or L2 regularization to the standard negative log-likelihood loss (score); a sketch is given below this list.
    • With regularization, continuously increasing (or decreasing) the values in ${\bf w}$ eventually stops, because the added penalty (e.g., $\Vert {\bf w} \Vert_1$ for L1 regularization) outweighs the ever-smaller improvement in the likelihood.
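For example, with L2 regularization the score becomes the NLL plus $\frac{\lambda}{2}\Vert {\bf w} \Vert_2^2$, so the gradient gains an extra $\lambda {\bf w}$ term. A minimal sketch of how the earlier logistic_derivative could be modified (the strength lam is an illustrative choice, and leaving the bias unregularized is a common but optional design decision):

import numpy as np

def logistic_derivative_l2(model, xs, errors, lam=0.1):
    """Gradient of the L2-regularized negative log-likelihood.
    xs has a trailing constant-1 column (the bias), which is left unregularized."""
    dw = xs.T @ errors          # gradient of the NLL part, exactly as before
    reg = lam * model['w']      # gradient of (lam/2) * ||w||^2
    reg[-1] = 0.0               # do not penalize the bias entry
    return dict(w=dw + reg)

# Usage mirrors logistic_derivative:
#   grad = logistic_derivative_l2(model, np.array(xs), np.array(errors))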

6.11. (Extra) What about multiple classes?¶

  • We have seen that using multiple binary classifiers is not ideal
  • How should we deal with multiple classes?
No description has been provided for this image

7. Multiclass Logistic Regression¶

7.1. Logistic Regression for multiple classes¶

  • Consider $K$ classes and $n$ observations
  • Let $y_i$ be the class of the $i$-th example with feature vector $x_i$.
  • The probability that $x_i$ belongs to class $k$ is: $$ P(Y=k|X = x_i) = \frac{\exp({\bf w}_k^T x_i)}{\sum_{k' = 1}^K \exp({\bf w}_{k'}^T x_i)} ,\quad k=1,\ldots,K. $$
  • The new model space is a matrix ${\bf W} \in \mathbb{R}^{K \times (p+1)}$ (one row per class; the extra column holds the per-class bias once a constant 1 is appended to ${\bf x}$, as in the code below).
  • A matrix version of the above calculation gives $$ \left[ \begin{array}{cccc} P(Y=1|x_i) \\ P(Y=2|x_i)\\ \vdots \\ P(Y=K|x_i) \end{array} \right] = \text{softmax}\left( \left[ \begin{array}{cccc} W_{11} & W_{12} & \ldots & W_{1p} \\ W_{21} & W_{22} & \ldots & W_{2p} \\ \vdots &\vdots & \ldots & \vdots \\ W_{K1} & W_{K2} & \ldots & W_{Kp} \\ \end{array} \right] \cdot \left[ \begin{array}{cccc} x_1 \\ x_2\\ \vdots \\ x_p \end{array} \right] \right), $$ where softmax$({\bf b})$ applies $\frac{\exp({\bf b}_j)}{\sum_{j'} \exp({\bf b}_{j'})}$ to each element $j$ of a vector ${\bf b}$ (a numerically stable implementation is sketched right after this list).
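An implementation aside (not from the slides): computing the softmax directly can overflow for large inputs, so implementations usually subtract the maximum entry first, which does not change the result. A minimal sketch:

import numpy as np

def softmax_stable(b):
    """Softmax of a vector b, shifted by max(b) for numerical stability."""
    z = b - np.max(b)   # subtracting a constant from every entry leaves the softmax unchanged
    e = np.exp(z)
    return e / e.sum()

print(softmax_stable(np.array([1.0, 2.0, 3.0])))            # ordinary case
print(softmax_stable(np.array([1000.0, 1001.0, 1002.0])))   # would overflow without the shift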

7.2. Logistic Regression for multiple classes (Step 1): Encoding of the output $y$¶

No description has been provided for this image

7.3. Logistic Regression for multiple classes (Step 2): Log-likelihood¶

Assume $y(i)$ is the one-hot encoding (as described above) of the label of the $i$-th training example, and ${\bf x}(i)$ is its feature input.

The output of $f({\bf x};{\bf W}) = \text{softmax}({\bf W} {\bf x})$ is a vector of $K$ probabilities.

Then, $$\text{Score} = \sum_{i=1}^n \text{Score}(i) = -\sum_{i=1}^n \sum_{j=1}^K y_{j}(i) \log f({\bf x}(i);{\bf W})_j, $$ where $(z)_j$ denotes the $j$-th element of the vector $z$.

7.3.1. Example of Multiclass Logistic Regression for $K=2$¶

In [12]:
# There are only two features in the data X[:,0] and X[:,1]
n_features = 2
# There are only two classes: 0 (purple points) and 1 (yellow points)
n_class = 2

def multiclass_init_weights(n_features):
    # Initialize weights with Standard Normal random variables
    model = dict(
        W=np.random.randn(n_features + 1, n_class),
    )
    return model

model = multiclass_init_weights(n_features=n_features)

print(model)
{'W': array([[ 1.25320278, -0.67771525],
       [ 0.38927771,  0.86286996],
       [-0.23545144, -1.19446593]])}
In [13]:
# Defines the softmax function.
def softmax(x):
    return np.exp(x) / np.exp(x).sum()


# For a single example $x$
def multiclass_logistic_regression(x, model):
    x = np.append(x, 1)
    
    # Input times weight vector
    f_x = x @ model['W']

    # Softmax activation
    y_hat = softmax(f_x)

    return y_hat
In [14]:
def multiclass_logistic_derivative(model, xs, errors):
    """xs, errs contain all information (input, error) of all the data"""
    # gradient of the negative log-likelihood with respect to model parameters
    dW = xs.T @ errors

    return dict(W=dW)
In [15]:
# Get gradient of all parameters

def multiclass_get_gradient(model, X_train, y_train):
    xs, errors = [], []

    for x, cls_idx in zip(X_train, y_train):
        y_pred = multiclass_logistic_regression(x, model)

        # Create one-hot coding of true label
        y_true = np.zeros(n_class)
        y_true[int(cls_idx)] = 1.

        error = y_pred - y_true

        # Accumulate the information (input, error) of each example
        # x: all inputs
        # error: all errors
        x = np.append(x, 1)
        xs.append(x)
        errors.append(error)

    # Get gradients
    return multiclass_logistic_derivative(model, np.array(xs), np.array(errors))
In [16]:
def multiclass_one_gradient_step(model, X_train, y_train, learning_rate=1e-2):
    grad = multiclass_get_gradient(model, X_train, y_train)

    # Update all parameters in the parameter dictionary
    for param in grad:
        # Learning rate: default 1e-2
        model[param] -= learning_rate * grad[param]

    return model
In [17]:
def multiclass_gradient_descent(model, X_train, y_train, no_iter=10):
    for epoch in range(no_iter):
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}')

        model = multiclass_one_gradient_step(model, X_train, y_train)

    print(f'Epoch {epoch}')
    return model
In [18]:
no_iter = 100

# Reset model
model = multiclass_init_weights(n_features)

# Train the model
model = multiclass_gradient_descent(model, X_train, y_train, no_iter=no_iter)
Epoch 0
Epoch 10
Epoch 20
Epoch 30
Epoch 40
Epoch 50
Epoch 60
Epoch 70
Epoch 80
Epoch 90
Epoch 99
In [19]:
y_pred = np.zeros_like(y_test)

accuracy = 0

for i, x in enumerate(X_test):
    # Predict the distribution of label
    y_hat = multiclass_logistic_regression(x, model)
    # Get label by picking the most probable one
    y = np.argmax(y_hat)
    y_pred[i] = y

# Accuracy of predictions with the true labels and take the percentage
# Because our dataset is balanced, measuring just the accuracy is OK
accuracy = (y_pred == y_test).sum() / y_test.size
print('Test accuracy after {} gradient steps: {}'.format(no_iter,accuracy))

pylab.scatter(X_test[:,0], X_test[:,1], c=y_pred)
pylab.show()
Test accuracy after 100 gradient steps: 0.8486666666666667
No description has been provided for this image

8. Logistic Regression code using Pytorch¶

  • Understanding Pytorch tensors: tutorial

Step 1: Understanding Pytorch's Automatic Differentiation¶

First, we will create a variable $x \in \mathbb{R}$ and assign it the value $x = 3$.

In [20]:
### The following code is intuitive but **WRONG**

import torch

x = torch.zeros(1, requires_grad=True) # Assigns a constant zero to unidimensional variable x 
                                       # requires_grad=True tells pytorch we will compute gradients for this variable

x = 3. ## Here we assign float(3.) to "x" ... which is no longer a differentiable torch tensor

print(f'Value of x = {x.data}') # Will return an error because now x is a float value
print(f'Gradient of x = {x.grad}') # Never reached: the line above already fails, because x is now a plain float
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[20], line 10
      6                                        # requires_grad=True tells pytorch we will compute gradients for this variable
      8 x = 3. ## Here we assign float(3.) to "x" ... which is no longer a differentiable torch tensor
---> 10 print(f'Value of x = {x.data}') # Will return an error because now x is a float value
     11 print(f'Gradient of x = {x.grad}') # Will return None because it we need to invoke .backward() first

AttributeError: 'float' object has no attribute 'data'
  • This is the correct way to assign values to differentiable variables in PyTorch
In [21]:
import torch

x = torch.zeros(1, requires_grad=True) # Assigns a constant zero to unidimensional variable x 
                                       # requires_grad=True tells pytorch we will compute gradients for this variable
x.data.fill_(3.)  

print(f'Value of x = {x.data}') # Will print tensor([3.])
print(f'Gradient of x = {x.grad}') # Will print None because we have not invoked .backward() yet
Value of x = tensor([3.])
Gradient of x = None

Consider the following function: $$ z(x) = 10 y(x)^2, $$ where $$ y(x) = 2 x + 1. $$

Now, let's compute $$\left. \frac{\partial z(x)}{\partial x} \right|_{x = 3}.$$

In [22]:
x = torch.zeros(1, requires_grad=True) # Assigns a constant zero to unidimensional variable x 
                                       # requires_grad=True tells pytorch we will compute gradients for this variable

x.data.fill_(3.) # Constant MUST be floating point (if the tensor is an Integer, it won't work)

y = 2 * x + 1
z = 10 * y * y
z.backward()     # automatically computes the gradient ∂z/∂x evaluated at x = 3 using backpropagation
print(x.grad)    # ∂z/∂x = 280
tensor([280.])
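We can verify this result by hand with the chain rule:

$$ \frac{\partial z}{\partial x} = \frac{\partial z}{\partial y}\,\frac{\partial y}{\partial x} = (20\,y)\cdot 2 = 40\,(2x+1), $$

which at $x = 3$ gives $40 \cdot 7 = 280$, matching x.grad above.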

Step 2: Create Logistic Regression Model using Pytorch Libraries¶

In [23]:
import torch.nn as nn
import torch.nn.functional as F

class LogisticRegression(nn.Module):
    def __init__(self, input_size, output_size): ## This will also take care of initializing the weights
        super(LogisticRegression, self).__init__() 
        self.f_x = nn.Linear(input_size, output_size)
        self.output_size = output_size

    def forward(self, x):
        if self.output_size == 1:
            y_hat = torch.sigmoid(self.f_x(x))
        else:
            y_hat = F.softmax(self.f_x(x),dim=-1)  # dim=-1: softmax over the class dimension (the last one), which also works for batched inputs of shape (N, C)
        return y_hat
    

Can we use a GPU?¶

In [24]:
use_cuda = torch.cuda.is_available()  # Our flag to check if Nvidia GPU is available

device = torch.device("cuda" if use_cuda else "cpu") # Decide device between CUDA (GPU) and CPU compute

print(f'Is CUDA available? {use_cuda}')
Is CUDA available? False

Move training data into Torch tensors¶

  • Training data becomes a tensor
    • One of the tensor dimensions is the number of training examples
  • The data moves to the GPU if device = torch.device("cuda")
In [25]:
# Transform original data into pytorch tensors and send to device (send data to GPU if available)
torch_X_train = torch.FloatTensor(X_train).to(device)
torch_X_test = torch.FloatTensor(X_test).to(device)
torch_y_train = torch.FloatTensor(y_train).to(device)
torch_y_test = torch.FloatTensor(y_test).to(device)

print(f'Now data is in Pytorch tensor form: {torch_y_train}')
Now data is in Pytorch tensor form: tensor([1., 0., 0.,  ..., 1., 0., 1.])

Instantiate logistic regression¶

Creates a Logistic Regression model with

  • 2-dimensional input
  • 1-dimensional sigmoid output
  • Default weight initialization
    • Linear layer weight and bias values are initialized by sampling $\text{Uniform}(-\sqrt{k}, \sqrt{k})$, where $k = \frac{1}{\text{in_dimension}}$
In [26]:
##    Send model parameters to "device" (the GPU, if one is available)
LR = LogisticRegression(input_size = 2, output_size = 1).to(device)

print(f'Logistic regression model created: {LR}')
Logistic regression model created: LogisticRegression(
  (f_x): Linear(in_features=2, out_features=1, bias=True)
)

Logistic Regression prediction¶

Get current predicted training label probabilities from the logistic

In [27]:
y_pred = LR(torch_X_train) ### Get output of FeedForward network over the training data

print(f'Predicted values of X: {y_pred}')
Predicted values of X: tensor([[0.5371],
        [0.6012],
        [0.6617],
        ...,
        [0.5381],
        [0.6722],
        [0.6029]], grad_fn=<SigmoidBackward0>)

Compute the Negative log-likelihood score¶

  • Define the loss function (negative log-likelihood score for unidimensional sigmoid outputs)
  • Compute the loss
In [28]:
# Define the loss function
# Negative log-likelihood loss (aka, cross-entropy loss) for binary classes
loss = nn.BCELoss() 

## Compute the loss of the predicted values
error = loss(y_pred, torch_y_train.view(-1,1)) ## .view(-1,1) avoids a warning about the tensor dimension
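As a quick sanity check (a sketch with made-up numbers), nn.BCELoss with its default mean reduction matches the hand-written negative log-likelihood averaged over the examples:

import torch
import torch.nn as nn

p = torch.tensor([[0.9], [0.2], [0.6]])   # illustrative predicted probabilities
t = torch.tensor([[1.0], [0.0], [1.0]])   # illustrative 0/1 labels

bce = nn.BCELoss()(p, t)                                          # built-in loss (mean reduction)
manual = -(t * torch.log(p) + (1 - t) * torch.log(1 - p)).mean()  # hand-written NLL, averaged

print(bce.item(), manual.item())   # the two values match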

Use the loss to compute gradients¶

  • Computes gradients from the loss via backpropagation
    • Never forget to zero the gradient vectors first (PyTorch accumulates gradients by default)
In [29]:
# Compute the gradients of the loss with respect to the logistic regression parameters
##  Gradient computation performed via backpropagation

LR.zero_grad() ## Zero current gradients (PyTorch accumulates gradients by default)

error.backward() ## Compute gradients via backpropagation

Define the learning rate¶

  • We will revisit the effect of learning rates later
    • Very small learning rates can result in poor generalization error
    • Commonly a number between 1 and 1e-05
In [30]:
## Define learning rate
learning_rate = 1e-2

Perform one gradient step¶

  • Will subtract the (gradient * learning_rate) from all parameters
In [31]:
print(f"Before gradient step: Layer 1 bias {LR.f_x.bias}")

## Go over all parameters and subtract their gradient
##   Subtract (not sum) the gradient since we want to minimize the loss
for f in LR.parameters():
    f.data.sub_(f.grad.data * learning_rate)

print(f"After gradient step: Layer 1 bias {LR.f_x.bias}")
Before gradient step: Layer 1 bias Parameter containing:
tensor([0.1277], requires_grad=True)
After gradient step: Layer 1 bias Parameter containing:
tensor([0.1266], requires_grad=True)
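The manual loop over LR.parameters() above is exactly what plain stochastic gradient descent from torch.optim does; a sketch of the equivalent update using the built-in optimizer (shown only as a sketch, so the recorded outputs below come from the manual version) would be:

import torch

optimizer = torch.optim.SGD(LR.parameters(), lr=learning_rate)  # same parameters, same learning rate

# One equivalent gradient step:
optimizer.zero_grad()                                        # zero current gradients
error = loss(LR(torch_X_train), torch_y_train.view(-1, 1))   # recompute the loss
error.backward()                                             # compute gradients via backpropagation
optimizer.step()                                             # parameter <- parameter - learning_rate * gradient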

Optimize the model¶

  • We will now perform 1000 optimization steps using the entire dataset
    • Epoch: Each time the entire dataset is used in the optimization
  • It is useful to plot the training accuracy
In [32]:
## Loop through multiple gradient steps

for epoch in range(1000):
    ## Predict values using updated model
    y_pred = LR(torch_X_train)
    
    ## Find predicted class (as a number {1,0})
    pred = y_pred.gt(0.5) + 0.0
    
    # Compute accuracy
    if epoch % 100 == 9:
        correct = pred.eq(torch_y_train.view_as(pred)).sum().item()
        print(f'Epoch {epoch} training accuracy: {int(correct/torch_y_train.shape[0] * 100)}%')

    ## Compute the new error
    error = loss(y_pred, torch_y_train.view(-1,1)) ## .view(-1,1) avoids a warning about the tensor dimension
    
    LR.zero_grad() ## Zero current gradients (PyTorch accumulates gradients by default)

    error.backward() ## Compute gradients in Pytorch via backpropagation
    
    ## Go over all parameters and subtract their gradient
    for f in LR.parameters():
        f.data.sub_(f.grad.data * learning_rate)

## Final train accuracy
if epoch % 100 == 9:
    correct = pred.eq(torch_y_train.view_as(pred)).sum().item()
    print(f'Correct {int(correct/torch_y_train.shape[0] * 100)}%')
Epoch 9 training accuracy: 71%
Epoch 109 training accuracy: 73%
Epoch 209 training accuracy: 74%
Epoch 309 training accuracy: 76%
Epoch 409 training accuracy: 77%
Epoch 509 training accuracy: 78%
Epoch 609 training accuracy: 79%
Epoch 709 training accuracy: 79%
Epoch 809 training accuracy: 80%
Epoch 909 training accuracy: 80%
In [33]:
# Disable gradient tracking during evaluation (no backpropagation needed)
with torch.no_grad():
    # Put the model in evaluation mode
    LR.eval()
    # Predict the distribution of label
    y_hat = LR(torch_X_test)
    # Get label by picking the most probable one
    y_pred = y_hat.gt(0.5) + 0.0

    # Accuracy of predictions with the true labels and take the percentage
    # Because our dataset is balanced, measuring just the accuracy is OK
    correct = y_pred.eq(torch_y_test.view_as(y_pred)).sum().item()
    print(f'Correctly classified test examples {int(correct/torch_y_test.shape[0] * 100)}%')

    pylab.scatter(X_test[:,0], X_test[:,1], c=y_pred)
    pylab.show()
Correctly classified test examples 82%
No description has been provided for this image

Extra: Weight initialization (we will revisit this when we talk about the optimization)¶

In [34]:
## Try different weight initializations of the linear layers
## Not needed, but you can apply a different initialization of the weights before optimizing

def init_weights(layer):
    if type(layer) == nn.Linear:
        torch.nn.init.normal_(layer.weight)
        layer.bias.data.fill_(0.01)

LR.apply(init_weights) ## Function apply will go over all layers
Out[34]:
LogisticRegression(
  (f_x): Linear(in_features=2, out_features=1, bias=True)
)