// ---------------------------------------------------------------------------
// Load the housing-price dataset selected by the user.
// ---------------------------------------------------------------------------
// uigetfile's positional arguments are (file_mask, dir, boxTitle): the dialog
// title is the THIRD argument, so the start directory is given explicitly.
filename = uigetfile(["*housing_price.csv*"], pwd(), 'file selector');   // Ask user to select CSV file
if isempty(filename) then
    // An empty string comes back when the dialog is cancelled.
    error("No CSV file was selected; cannot continue.");
end
fullpathname1 = filename;   // Called with one output, uigetfile already returns path + file name
data = csvRead(fullpathname1);   // Read CSV file into a numeric matrix

[n, m] = size(data);           // n rows and m columns of the raw matrix
if m < 4 then
    // Need id, date, at least one feature and the target column.
    error("Expected at least 4 columns in the CSV file but found " + string(m) + ".");
end
data = data(:, 3:m);           // Exclude the first 2 columns, i.e. id and date
numFeatures = size(data, 2) - 1;   // Last remaining column is the target variable

//PREPROCESSING STARTS
// Column positions are counted AFTER the id/date columns were dropped and
// follow the same order as the featureNames list used for the interactive
// prediction further down (bedrooms, bathrooms, living area, lot area, ...).
bathroomsCol = 2;   // number_of_bathrooms
gradeCol = 9;       // grade_of_the_house

// Drop every row that contains at least one NaN or Inf entry.
isNaNOrInf = isnan(data) | isinf(data);
validRows = sum(isNaNOrInf, 2) == 0;
data = data(validRows, :);
if size(data, 1) == 0 then
    error("No valid rows remain after removing rows containing NaN or Inf.");
end

// Truncate the bathroom count to a whole number. int() drops the fractional
// part but returns doubles, so the data matrix keeps its double type for the
// statistics computed below (an int32() value written into the matrix could
// otherwise affect the type of the whole matrix).
data(:, bathroomsCol) = int(data(:, bathroomsCol));
data(:, gradeCol) = round(data(:, gradeCol) / 2);  // Scale down the grade of the house

// Standardise each feature column: subtract its mean and divide by its
// population standard deviation. Both vectors are kept because the same
// transformation must be applied to any new sample before prediction.
featureMeans = mean(data(:, 1:numFeatures), 1);
featureStds = zeros(1, numFeatures);
for i = 1:numFeatures
    centred = data(:, i) - featureMeans(i);
    stdDev = sqrt(mean(centred.^2));
    if stdDev == 0 then
        stdDev = 1; // Constant column: avoid division by zero
    end
    featureStds(i) = stdDev;

    scaled = centred ./ stdDev;
    // Defensive: replace any NaN/Inf produced by the scaling with 0.
    badEntries = isnan(scaled) | isinf(scaled);
    if or(badEntries) then
        scaled(badEntries) = 0;
    end
    data(:, i) = scaled;
end
//PREPROCESSING ENDS

// Separate the feature columns from the target (last) column.
features = data(:, 1:numFeatures);
labels = data(:, numFeatures+1);

// Shuffle the sample indices with a Fisher-Yates shuffle: walking from the
// end, each position i is swapped with a uniformly chosen position in 1..i.
// (Swapping every position with ANY position, as a naive shuffle does, does
// not produce all permutations with equal probability.)
numSamples = size(features, 1);
indices = 1:numSamples;
for i = numSamples:-1:2
    j = floor(rand() * i) + 1;   // rand() is in [0,1), so j is in 1..i
    temp = indices(i);
    indices(i) = indices(j);
    indices(j) = temp;
end

// Split the shuffled indices into training and testing sets (80:20 ratio).
numTrainSamples = floor(0.8 * numSamples);
numTestSamples = numSamples - numTrainSamples;
trainIndices = indices(1:numTrainSamples);
testIndices = indices(numTrainSamples+1:numSamples);

// Extract the training/testing features and labels for those indices.
trainFeatures = features(trainIndices, :);
testFeatures = features(testIndices, :);
trainLabels = labels(trainIndices);
testLabels = labels(testIndices);

// Fit an ordinary least-squares linear model and predict on the test set.
X_train = [ones(size(trainFeatures, 1), 1), trainFeatures];   // Leading column of ones = intercept term
// Solve the least-squares problem directly with backslash instead of forming
// the normal equations (X'X) \ (X'y): building X'X squares the condition
// number, which degrades accuracy when feature columns are nearly collinear.
theta = X_train \ trainLabels;
X_test = [ones(size(testFeatures, 1), 1), testFeatures];   // Same intercept column for the test features

predicted_labels = X_test * theta;    // Make predictions

// Keep every 40th test sample so the plot and the printed table stay readable.
downsample_factor = 40; 
downsampled_testLabels = testLabels(1:downsample_factor:size(testLabels, "*"));
downsampled_predicted_labels = predicted_labels(1:downsample_factor:size(predicted_labels, "*"));

// Line plot of the actual vs. predicted values (both curves in figure 1)
figure(1);
plot(downsampled_testLabels, 'b', 'LineWidth', 2);
plot(downsampled_predicted_labels, 'r', 'LineWidth', 2);
legend('Actual Values', 'Predicted Values');
xtitle('Actual vs. Predicted Values', 'Index', 'Values');


// Display the labels and predicted values for testing data.
// Show at most 10 rows; a small test set can yield fewer downsampled points,
// and indexing 1:10 unconditionally would then go out of bounds.
numToDisplay = min(10, size(downsampled_testLabels, "*"));
indices_to_display = 1:numToDisplay;
disp('Test Labels   Predicted Values');
disp([string(downsampled_testLabels(indices_to_display))+"          "+string(downsampled_predicted_labels(indices_to_display))]);

// Root Mean Squared Error (RMSE) between observed and predicted values.
function rmse = calculate_rmse(y_true, y_pred)
    // y_true : observed values
    // y_pred : predicted values, same size as y_true
    // rmse   : square root of the mean of the squared differences
    squaredErrors = (y_true - y_pred).^2;
    rmse = sqrt(mean(squaredErrors));
endfunction

// Mean Absolute Error (MAE) between observed and predicted values.
function mae = calculate_mae(y_true, y_pred)
    // y_true : observed values
    // y_pred : predicted values, same size as y_true
    // mae    : mean of the absolute differences
    absoluteErrors = abs(y_true - y_pred);
    mae = mean(absoluteErrors);
endfunction

// R-squared (coefficient of determination) of predictions against observations.
function r_squared = calculate_r_squared(y_true, y_pred)
    // y_true    : observed values
    // y_pred    : predicted values, same size as y_true
    // r_squared : 1 - (residual sum of squares / total sum of squares)
    residuals = y_true - y_pred;
    deviations = y_true - mean(y_true);
    r_squared = 1 - (sum(residuals.^2) / sum(deviations.^2));
endfunction

// Adjusted R-squared: R-squared penalised for the number of predictors.
function adj_r_squared = calculate_adj_r_squared(y_true, y_pred, n, p)
    // y_true : observed values
    // y_pred : predicted values, same size as y_true
    // n      : number of samples
    // p      : number of predictors
    // adj_r_squared : 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    residualSS = sum((y_true - y_pred).^2);
    totalSS = sum((y_true - mean(y_true)).^2);
    plainR2 = 1 - (residualSS / totalSS);
    dofRatio = (n - 1) / (n - p - 1);
    adj_r_squared = 1 - ((1 - plainR2) * dofRatio);
endfunction

// Evaluate the model on the held-out test set.
rmse = calculate_rmse(testLabels, predicted_labels);
mae = calculate_mae(testLabels, predicted_labels);
r_squared = calculate_r_squared(testLabels, predicted_labels);
adj_r_squared = calculate_adj_r_squared(testLabels, predicted_labels, numTestSamples, numFeatures);

// Display the metrics. Each label and value are joined into ONE string:
// the order in which disp() prints several arguments differs between Scilab
// versions, so a multi-argument call can show the value before its label.
disp("RMSE: " + string(rmse));
disp("MAE: " + string(mae));
disp("R-squared: " + string(r_squared));
disp("Adjusted R-squared: " + string(adj_r_squared));

// ---------------------------------------------------------------------------
// Interactive prediction for one new house entered by the user.
// ---------------------------------------------------------------------------
// Display names of the features, in the same order as the model's columns.
featureNames = ["number_of_bedrooms", 
                "number_of_bathrooms", 
                "living_area (in sq.ft)", 
                "lot_area (in sq.ft)", 
                "number_of_floors", 
                "waterfront_present (Binary, 0 for no waterfront, 1 for waterfront)", 
                "number_of_views", 
                "condition_of_the_house (Integer, Range: 0 to 5, where 5 represents the best condition)", 
                "grade_of_the_house (Integer, Range: 0 to 5, where 5 represents the best grade)", 
                "area_of_the_house_excluding_basement (in sq.ft)", 
                "area_of_the_basement (in sq.ft)", 
                "built_year (in years)", 
                "renovation_year (in years)", 
                "postal_code (6 digit Integer)", 
                "latitude (Float)", 
                "longitude (Float)", 
                "living_area_renov (in sq.ft)", 
                "lot_area_renov (in sq.ft)", 
                "number_of_schools_nearby ", 
                "distance_from_the_airport (in kilometers)"];

// The number of prompted features must match what the model was trained on;
// otherwise the scaling vectors and theta would be indexed inconsistently.
numFeatures = size(featureNames, "*");  // Number of features
if size(featureMeans, "*") <> numFeatures | size(featureStds, "*") <> numFeatures | size(theta, "*") <> numFeatures + 1 then
    error("The trained model does not have " + string(numFeatures) + " features; the feature name list and the dataset are inconsistent.");
end

// True only for a single real, finite number (rejects [], strings, vectors, NaN, Inf).
function ok = is_valid_feature_value(v)
    ok = %f;
    if type(v) <> 1 then
        return;
    end
    if size(v, "*") <> 1 then
        return;
    end
    if ~isreal(v) then
        return;
    end
    if isnan(v) | isinf(v) then
        return;
    end
    ok = %t;
endfunction

new_row = zeros(1, numFeatures);   // Initialize a row for the new data

// Prompt the user for each feature and re-ask until a usable number is given.
// (An empty answer yields [], and assigning [] to new_row(i) would DELETE
// that element instead of storing a value.)
for i = 1:numFeatures
    enteredValue = input("Enter value for feature " + featureNames(i) + ": ");
    while ~is_valid_feature_value(enteredValue)
        disp("Please enter a single finite number.");
        enteredValue = input("Enter value for feature " + featureNames(i) + ": ");
    end
    new_row(i) = enteredValue;
end

// Pre-process the new data with the training means / standard deviations.
new_row_preprocessed = zeros(1, numFeatures);
for i = 1:numFeatures
    new_row_preprocessed(i) = (new_row(i) - featureMeans(i)) / featureStds(i);
end
new_data_with_intercept = [1, new_row_preprocessed];   // Leading 1 for the intercept term

// Make a prediction using the trained model
predicted_value = new_data_with_intercept * theta;
price_in_lakhs = predicted_value / 1e5;    // 1 lakh  = 1e5
price_in_crores = predicted_value / 1e7;   // 1 crore = 1e7

// Display the predicted value
disp('The predicted price of the house/property is ' + string(predicted_value) + '(INR), which translates to ' + string(price_in_lakhs) + ' lakhs OR ' + string(price_in_crores) + ' crores.')
abort();
