// Seed the grand() random number generator so repeated runs draw the same values
grand("setsd", 123);

// Generate a synthetic two-feature binary classification dataset.
//
// Returns:
//   X - 100-by-3 design matrix: a column of ones (intercept term) followed by
//       two score columns drawn from grand(..., "unf", 30, 100).
//   y - 100-by-1 column of class labels stored as doubles (0 or 1).
//
// Side effects: consumes values from the grand() generator and prints the
// first five score pairs to the console.
function [X, y] = generate_data()
    n_samples = 100;

    // Two test scores per student. The order of the grand() calls is kept fixed
    // so that a given seed always yields the same dataset.
    score1 = grand(n_samples, 1, "unf", 30, 100);
    score2 = grand(n_samples, 1, "unf", 30, 100);

    // Feature matrix with a leading intercept column
    X = [ones(n_samples, 1), score1, score2];

    // Linear score with standard normal noise, mapped through the logistic function
    z = -5 + 0.08*score1 + 0.07*score2 + grand(n_samples, 1, "nor", 0, 1);
    prob = 1 ./ (1 + exp(-z));

    // Threshold at 0.5. The comparison produces a boolean matrix; convert it to
    // numeric 0/1 so callers can use y directly in arithmetic (h - y, 1 - y,
    // y .* log(h)) without relying on implicit boolean-to-double conversion.
    y = bool2s(prob > 0.5);

    // Display a sample of the data
    disp("Sample of the dataset (first 5 rows):");
    for i = 1:5
        printf("Score 1: %.4f, Score 2: %.4f\n", score1(i), score2(i));
    end
endfunction

// Standardize every column of X except the first (the intercept column).
//
// Each column j >= 2 is shifted by its mean and divided by its standard
// deviation as computed by stdev(). A column whose standard deviation is
// exactly zero (all values identical) cannot be scaled; it is mapped to zeros
// instead of dividing by zero.
//
// Parameters:
//   X     - matrix whose first column is left untouched.
// Returns:
//   X_std - matrix of the same size as X with columns 2..end standardized.
function [X_std] = standardize_data(X)
    X_std = X;
    for j = 2:size(X, 2)
        col = X(:, j);
        std_j = stdev(col);
        if std_j == 0 then
            // Constant column: every entry equals the mean, so the centered
            // value is zero and no scaling is possible.
            X_std(:, j) = zeros(size(X, 1), 1);
        else
            X_std(:, j) = (col - mean(col)) / std_j;
        end
    end
endfunction

// Logistic (sigmoid) function 1 / (1 + exp(-z)), evaluated element-wise.
// The result has the same size as z.
function g = sigmoid(z)
    denom = 1 + exp(-z);
    g = 1 ./ denom;
endfunction

// Average negative log-likelihood of the logistic model.
//
// Parameters:
//   X     - design matrix (one row per sample).
//   y     - column of 0/1 labels, one per row of X.
//   theta - parameter column vector.
// Returns:
//   J     - scalar cost. %eps is added inside each log() so that a predicted
//           probability of exactly 0 or 1 does not produce log(0).
function J = compute_cost(X, y, theta)
    n_obs = length(y);
    p = sigmoid(X * theta);
    term_pos = y .* log(p + %eps);
    term_neg = (1-y) .* log(1-p + %eps);
    J = -(1/n_obs) * sum(term_pos + term_neg);
endfunction

// Batch gradient descent: every update uses the gradient over all samples.
//
// Parameters:
//   X, y      - design matrix and label column.
//   theta     - initial parameter column vector.
//   alpha     - learning rate.
//   num_iters - number of update steps.
// Returns:
//   theta     - parameters after num_iters updates.
//   J_history - num_iters-by-1 vector; entry k is the full-data cost after step k.
function [theta, J_history] = batch_gradient_descent(X, y, theta, alpha, num_iters)
    J_history = zeros(num_iters, 1);
    m = length(y);

    for step = 1:num_iters
        // Prediction error on the whole training set
        residual = sigmoid(X * theta) - y;

        // Mean gradient, then one descent step
        theta = theta - alpha * ((1/m) * X' * residual);

        // Record the cost reached after this step
        J_history(step) = compute_cost(X, y, theta);
    end

endfunction

// Stochastic gradient descent: every update uses one randomly drawn sample.
//
// Parameters:
//   X, y      - design matrix and label column.
//   theta     - initial parameter column vector.
//   alpha     - learning rate.
//   num_iters - number of update steps (one sample per step).
// Returns:
//   theta     - parameters after num_iters updates.
//   J_history - num_iters-by-1 vector; entry k is the cost on the full dataset
//               after step k, so curves are comparable with the other methods.
function [theta, J_history] = stochastic_gradient_descent(X, y, theta, alpha, num_iters)
    J_history = zeros(num_iters, 1);
    m = length(y);

    for step = 1:num_iters
        // Draw one row index uniformly from 1..m
        pick = grand(1, 1, "uin", 1, m);
        x_row = X(pick, :);

        // Single-sample gradient and descent step
        err = sigmoid(x_row * theta) - y(pick);
        theta = theta - alpha * (x_row' * err);

        // Cost is evaluated on all samples, not just the one drawn
        J_history(step) = compute_cost(X, y, theta);
    end

endfunction

// Mini-batch gradient descent: every update uses batch_size randomly drawn samples.
//
// Parameters:
//   X, y       - design matrix and label column.
//   theta      - initial parameter column vector.
//   alpha      - learning rate.
//   num_iters  - number of update steps.
//   batch_size - number of row indices drawn per step. Indices are drawn
//                independently, so a row can appear more than once in a batch.
// Returns:
//   theta      - parameters after num_iters updates.
//   J_history  - num_iters-by-1 vector; entry k is the cost on the full dataset
//                after step k.
function [theta, J_history] = mini_batch_gradient_descent(X, y, theta, alpha, num_iters, batch_size)
    J_history = zeros(num_iters, 1);
    m = length(y);

    for step = 1:num_iters
        // Draw the row indices for this batch
        rows = grand(batch_size, 1, "uin", 1, m);
        Xb = X(rows, :);

        // Batch-averaged gradient and descent step
        residual = sigmoid(Xb * theta) - y(rows);
        theta = theta - alpha * ((1/batch_size) * Xb' * residual);

        // Cost is evaluated on all samples, not just the batch
        J_history(step) = compute_cost(X, y, theta);
    end

endfunction

//====================== Data and Parameters ======================

// Synthetic dataset: X carries an intercept column, y holds the labels
[X, y] = generate_data();

// m = number of training examples, n = number of columns (features + intercept)
[m, n] = size(X);

// All runs start from the zero vector
initial_theta = zeros(n, 1);

// Standardized copy of the features (intercept column left as is)
X_std = standardize_data(X);

// Iteration budget per algorithm
num_iters_bgd = 8000;
num_iters_sgd = 8000;
num_iters_mini_bgd = 8000;

// Learning rates
alpha_bgd = 0.01;
alpha_sgd = 0.00002;       // Same value as alpha_sgd_low; not used by the runs below
alpha_sgd_high = 0.001;    // Larger step for SGD, to make the fluctuations visible
alpha_sgd_low = 0.00002;   // Smaller step for SGD, to damp the fluctuations
alpha_mini_bgd = 0.01;

// Samples drawn per mini-batch step
batch_size = 10;

//====================== Model Training ======================
// The order of the stochastic runs is kept fixed: they share one random
// generator, so reordering them would change every subsequent draw.

// BGD on the raw features
[theta_bgd, J_history_bgd] = batch_gradient_descent(X, y, initial_theta, alpha_bgd, num_iters_bgd);

// SGD on the raw features, small learning rate
[theta_sgd, J_history_sgd] = stochastic_gradient_descent(X, y, initial_theta, alpha_sgd_low, num_iters_sgd);

// SGD on the raw features, large learning rate
[theta_sgd_high, J_history_sgd_high] = stochastic_gradient_descent(X, y, initial_theta, alpha_sgd_high, num_iters_sgd);

// Mini-BGD on the raw features
[theta_mini_bgd, J_history_mini_bgd] = mini_batch_gradient_descent(X, y, initial_theta, alpha_mini_bgd, num_iters_mini_bgd, batch_size);

// SGD and Mini-BGD again, this time on the standardized features
[theta_sgd_std, J_history_sgd_std] = stochastic_gradient_descent(X_std, y, initial_theta, alpha_sgd_low, num_iters_sgd);
[theta_mini_bgd_std, J_history_mini_bgd_std] = mini_batch_gradient_descent(X_std, y, initial_theta, alpha_mini_bgd, num_iters_mini_bgd, batch_size);

//====================== Display Results ======================

// Print the fitted parameters of the three runs on the raw features
printf("\nGradient Descent Algorithm Parameter Value Sample:\n");
printf("Algorithm    |    θ₀        |    θ₁        |    θ₂        \n");
printf("----------------------------------------------------------\n");
printf("BGD          | %.8f  | %.8f  | %.8f  \n", theta_bgd(1), theta_bgd(2), theta_bgd(3));
printf("SGD          | %.8f  | %.8f  | %.8f  \n", theta_sgd(1), theta_sgd(2), theta_sgd(3));
printf("Mini-BGD     | %.8f  | %.8f  | %.8f  \n", theta_mini_bgd(1), theta_mini_bgd(2), theta_mini_bgd(3));

//====================== Plot Results ======================
// Each figure shows the full-dataset cost recorded after every update step.

// Figure 1: BGD Loss vs Iterations
scf(1);
plot(1:num_iters_bgd, J_history_bgd, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 1. Diagram of loss and number of iterations under BGD');

// Figure 2: SGD Loss vs Iterations
scf(2);
plot(1:num_iters_sgd, J_history_sgd, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 2. Diagram of loss and number of iterations under SGD');

// Figure 3: Mini-BGD Loss vs Iterations
scf(3);
plot(1:num_iters_mini_bgd, J_history_mini_bgd, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 3. Diagram of loss and number of iterations under Mini-BGD');

// Figure 4: SGD with high learning rate.
// The title is built from the variables actually used for the run, so it
// cannot drift out of sync with the iteration count or learning rate.
scf(4);
plot(1:num_iters_sgd, J_history_sgd_high, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 4. SGD, Iterations=' + string(num_iters_sgd) + ', learning rate=' + string(alpha_sgd_high));

// Figure 5: SGD with low learning rate (same run as Figure 2, titled with its settings)
scf(5);
plot(1:num_iters_sgd, J_history_sgd, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 5. SGD, Iterations=' + string(num_iters_sgd) + ', learning rate=' + string(alpha_sgd_low));

// Figure 6: Mini-BGD with standardized data
scf(6);
plot(1:num_iters_mini_bgd, J_history_mini_bgd_std, 'r');
xlabel('Iterations');
ylabel('Loss');
title('Fig. 6. Diagram of loss and number of iterations under Mini-BGD (Standardized)');