Python Data Analysis and Machine Learning Techniques

Program 1: Data Manipulation with Pandas

This program demonstrates various data manipulation techniques using the Pandas library in Python.

Merging and Concatenating Data

  
import pandas as pd
import numpy as np

sales_data_1 = pd.DataFrame({
    'OrderID': [1, 2, 3, 4],
    'Product': ['Laptop', 'Tablet', 'Smartphone', 'Headphones'],
    'Sales': [100, 200, 2000, 800]
})

sales_data_2 = pd.DataFrame({
    'OrderID': [3, 4, 5, 6],
    'Product': ['Headphones', 'Laptop', 'Smartwatch', 'Tablet'],
    'Sales': [500, 300, 200, 900]
})

# Display the DataFrames
print("Sales Data 1:\n", sales_data_1)
print("\nSales Data 2:\n", sales_data_2)

# Merge DataFrames based on 'OrderID' using an inner join
merged_data = pd.merge(sales_data_1, sales_data_2, on='OrderID', how='inner', suffixes=('_left', '_right'))
print("\nMerged Data (Inner Join):\n", merged_data)

# Concatenate the DataFrames vertically
combined_data = pd.concat([sales_data_1, sales_data_2], ignore_index=True)
print("\nCombined Data (Concatenated Vertically):\n", combined_data)
  
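For contrast with the inner join above, the same merge with how='outer' keeps the non-matching OrderIDs and fills the gaps with NaN. A small added sketch, reusing the two DataFrames already defined:

# Merge DataFrames based on 'OrderID' using an outer join
outer_merged_data = pd.merge(sales_data_1, sales_data_2, on='OrderID', how='outer', suffixes=('_left', '_right'))
print("\nMerged Data (Outer Join):\n", outer_merged_data)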

Reshaping Data with Melt

  
# Create a sample DataFrame for reshaping
reshaping_data = pd.DataFrame({
    'Month': ['Jan', 'Feb', 'Mar'],
    'Product_A': [100, 150, 130],
    'Product_B': [90, 80, 120]
})

print("\nReshaping Data (Original):\n", reshaping_data)

# Melt the DataFrame to reshape it from wide to long format
melted_data = pd.melt(reshaping_data, id_vars=['Month'], var_name='Product', value_name='Sales')
print("\nMelted Data (Long Format):\n", melted_data)
  

Pivoting Data

  
# Create a sample DataFrame for pivoting
pivot_data = pd.DataFrame({
    'Month': ['Jan', 'Jan', 'Feb', 'Feb', 'Mar', 'Mar'],
    'Product': ['Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_A', 'Product_B'],
    'Sales': [100, 90, 150, 80, 130, 120]
})

print("\nPivot Data (Original):\n", pivot_data)

# Pivot the DataFrame to reshape it back to wide format
pivoted_data = pivot_data.pivot(index='Month', columns='Product', values='Sales')
print("\nPivoted Data (Wide Format):\n", pivoted_data)
  

Handling Missing Data

# Introduce some missing values
pivoted_data.loc['Feb', 'Product_A'] = np.nan
pivoted_data.loc['Mar', 'Product_B'] = np.nan
print("\nPivoted Data with Missing Values:\n", pivoted_data)

# Fill missing values with the mean of each column
filled_data = pivoted_data.fillna(pivoted_data.mean())
print("\nFilled Data (Missing Values Handled):\n", filled_data)

print("\nSummary Statistics of Filled Data:\n", filled_data.describe())
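Mean imputation is only one strategy. As an added comparison, a brief sketch of two other pandas fill methods applied to the same pivoted_data:

# Forward-fill, then back-fill any remaining gaps at the start
ffilled_data = pivoted_data.ffill().bfill()
print("\nFilled Data (forward/backward fill):\n", ffilled_data)

# Linear interpolation between neighbouring rows
interpolated_data = pivoted_data.interpolate()
print("\nFilled Data (interpolated):\n", interpolated_data)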

Program 2: String Manipulation

import re

# String Manipulation
# Sample text to work with
text = " Hello, World! Welcome to Python programming Language. "

# 1. Strip leading and trailing spaces
clean_text = text.strip()
print(f"Original Text: '{text}'")
print(f"Text after stripping spaces: '{clean_text}'")

# 2. Convert the text to uppercase
upper_text = clean_text.upper()
print(f"\nText in uppercase: '{upper_text}'")

# 3. Convert the text to lowercase
lower_text = clean_text.lower()
print(f"\nText in lowercase: '{lower_text}'")

# 4. Count occurrences of a substring (e.g., "o")
count_o = clean_text.count("o")
print(f"\nNumber of occurrences of 'o': {count_o}")

# 5. Replace a word in the string
replaced_text = clean_text.replace("Python", "HTML")
print(f"\nText after replacing 'Python' with 'HTML': '{replaced_text}'")

# 6. Find the position of a word in the string
position_world = clean_text.find("World")
print(f"\nPosition of 'World' in the text: {position_world}")

# 7. Split the text into words (by default on spaces)
words = clean_text.split()
print(f"\nList of words in the text: {words}")

# 8. Join the words back into a single string
joined_text = " ".join(words)
print(f"\nText after joining words: '{joined_text}'")

# 9. Check if the text starts with "Hello"
starts_with_hello = clean_text.startswith("Hello")
print(f"\nDoes the text start with 'Hello'? {starts_with_hello}")

# 10. Check if the text ends with a specific word (e.g., "programming.")
ends_with_programming = clean_text.endswith("programming.")
print(f"\nDoes the text end with 'programming.'? {ends_with_programming}")


Program 2 (continued): Regular Expressions

# Sample text
text = """
John's email is [warrnerjhon@gmail.com]. He said, "Python is awesome!!" It's a great language.
Another email: [xyz@gmail.com].
"""

# 1. Remove special characters except for spaces and email-related characters.
# Using regex to remove non-alphanumeric characters and non-email symbols
clean_text = re.sub(r"[^a-zA-Z0-9@\.\s]", "", text)
print("Text after removing special characters:")
print(clean_text)

# 2. Convert the text to lowercase
clean_text = clean_text.lower()
print("\nText after converting to lowercase:")
print(clean_text)

# 3. Replace multiple spaces with a single space
clean_text = re.sub(r"\s+", " ", clean_text)
print("\nText after replacing multiple spaces:")
print(clean_text)

# 4. Extract all words starting with a vowel (a, e, i, o, u)
vowel_words = re.findall(r"\b[aeiouAEIOU]\w+", clean_text)
print("\nWords starting with a vowel:")
print(vowel_words)

# 5. Replace email addresses with '[abc@gmail.com]'
masked_text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[abc@gmail.com]", clean_text)
print("\nText after replacing emails:")
print(masked_text)
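A natural companion step is extracting the addresses rather than masking them. A short added sketch reusing the same pattern on the cleaned text:

# Extract all email addresses from the cleaned text
emails = re.findall(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", clean_text)
print("\nExtracted email addresses:")
print(emails)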

Program 3: Time Series Analysis and Forecasting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Create sample time series data: one year of daily observations (illustrative date range)
np.random.seed(42)
date_range = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
data = pd.DataFrame({
    "Date": date_range,
    "Value_A": np.random.normal(100, 10, len(date_range)),
    "Value_B": np.random.normal(200, 20, len(date_range)),
})

# Set Date as the index
data.set_index("Date", inplace=True)

# GroupBy Mechanics
def groupby_mechanics(data):
    print("\n--- GroupBy Mechanics ---")
    # Group data by month and calculate mean
    grouped = data.resample('M').mean()
    print(grouped)
    return grouped

# Data Formats: Vector and Multivariate
def data_formats(data):
    print("\n--- Data Formats ---")
    # Display data as vector
    print("\nVector Format:")
    print(data["Value_A"].head())
    # Display multivariate time series
    print("\nMultivariate Time Series:")
    print(data.head())

# Forecasting Example
def time_series_forecasting(data):
    print("\n--- Forecasting ---")
    # Select a single column for forecasting
    ts = data["Value_A"]
    # Train-Test Split
    train = ts[:int(0.8 * len(ts))]
    test = ts[int(0.8 * len(ts)):]
    # Fit the Holt-Winters Exponential Smoothing model
    model = ExponentialSmoothing(train, seasonal="add", seasonal_periods=30).fit()
    # Forecast for the test period
    forecast = model.forecast(len(test))
    # Plot results
    plt.figure(figsize=(12, 6))
    plt.plot(train, label="Train")
    plt.plot(test, label="Test")
    plt.plot(forecast, label="Forecast")
    plt.legend()
    plt.title("Time Series Forecasting")
    plt.show()

# Main function
if __name__ == "__main__":
    print("--- Time Series Data ---")
    print(data.head())
    # Grouping Mechanics
    monthly_data = groupby_mechanics(data)
    # Data Formats
    data_formats(data)
    # Time Series Forecasting
    time_series_forecasting(data)

Program 4: Descriptive Statistics from a Frequency Distribution

import numpy as np
import pandas as pd

def cal_sta(data, freq):
    df = pd.DataFrame({'Value': data, 'Frequency': freq})
    total = df['Frequency'].sum()
    df['Weighted_Value'] = df['Value'] * df['Frequency']
    mean = df['Weighted_Value'].sum() / total
    cumulative_frequency = df['Frequency'].cumsum()
    median_index = cumulative_frequency.searchsorted(total / 2)
    median = df['Value'].iloc[median_index]
    mode = df['Value'][df['Frequency'].idxmax()]
    variance = np.average((df['Value'] - mean) ** 2, weights=df['Frequency'])
    std_deviation = np.sqrt(variance)
    mean_deviation = np.average(np.abs(df['Value'] - mean), weights=df['Frequency'])
    # Note: the quartiles below are computed on the distinct values only,
    # without weighting by frequency
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    quartile_deviation = (q3 - q1) / 2
    return {
        'Mean': mean,
        'Median': median,
        'Mode': mode,
        'Variance': variance,
        'Standard Deviation': std_deviation,
        'Mean Deviation': mean_deviation,
        'Quartile Deviation': quartile_deviation
    }

data_input = input("Enter the data values separated by commas (e.g., 10, 20, 30): ")
frequencies_input = input("Enter the corresponding frequencies separated by commas (e.g., 1, 2, 3): ")
data = list(map(int, data_input.split(',')))
frequencies = list(map(int, frequencies_input.split(',')))
statistics = cal_sta(data, frequencies)
for stat, value in statistics.items():
    print(f"{stat}: {value:.2f}")
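For a non-interactive run, a small worked example can replace the input() prompts. The values below are illustrative, not from the source:

# Illustrative frequency table: 10 occurs once, 20 three times, 30 four times, 40 twice
sample_data = [10, 20, 30, 40]
sample_freq = [1, 3, 4, 2]
for stat, value in cal_sta(sample_data, sample_freq).items():
    print(f"{stat}: {value:.2f}")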


Program 5: Model Validation (Validation Set, LOOCV, K-Fold Cross-Validation)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Function to calculate and display metrics
def display_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
    return rmse, mae, r2

# Validation Set Approach
def validation_set_approach(X, y):
    print("Validation Set Approach:")
    # Split the dataset into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    # Display metrics
    display_metrics(y_val, y_pred)

# Leave-One-Out Cross-Validation (LOOCV) Approach
# Note: LOOCV fits one model per observation, so it is slow on the full
# California housing dataset (about 20,000 rows)
def loocv_approach(X, y):
    print("Leave-One-Out Cross-Validation (LOOCV):")
    loo = LeaveOneOut()
    y_true, y_pred = [], []
    # Loop through each sample using LOOCV
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Initialize and train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        # Make prediction for the single test sample
        y_pred.append(model.predict(X_test)[0])
        y_true.append(y_test.iloc[0])
    # Display metrics
    display_metrics(y_true, y_pred)

# K-Fold Cross-Validation Approach
def kfold_approach(X, y, k=5):
    print(f"{k}-Fold Cross-Validation Approach:")
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    y_true, y_pred = [], []
    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Initialize and train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        # Make predictions on the test set
        y_pred.extend(model.predict(X_test))
        y_true.extend(y_test)
    # Display metrics
    display_metrics(y_true, y_pred)

# Main function to run all approaches
def main():
    print("Cross-Validation for RMSE, MAE, and R²:\n")
    validation_set_approach(X, y)
    print("\n")
    loocv_approach(X, y)
    print("\n")
    kfold_approach(X, y, k=5)  # You can change k for different K-Fold Cross-Validation

# Execute the main function
if __name__ == "__main__":
    main()
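The explicit k-fold loop above can also be written with scikit-learn's cross_val_score helper. A compact added sketch of that shortcut, reusing X and y from the program:

from sklearn.model_selection import cross_val_score

# Scores are negated by sklearn so that higher is always better; flip the sign back
rmse_scores = -cross_val_score(LinearRegression(), X, y, cv=5, scoring="neg_root_mean_squared_error")
print("5-fold RMSE per fold:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())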


Program 6: Probability Distributions

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, binom, poisson, bernoulli

def get_user_data():
    data_input = input("Enter the data values separated by commas (e.g., 10, 20, 30): ")
    frequencies_input = input("Enter the corresponding frequencies separated by commas (e.g., 2, 3, 4): ")
    data = list(map(int, data_input.split(',')))
    freq = list(map(int, frequencies_input.split(',')))
    return data, freq

def plot_normal_distribution(data, freq):
    mean = np.mean(data)
    std_dev = np.std(data)
    x = np.linspace(min(data), max(data), 100)
    pdf = norm.pdf(x, mean, std_dev)
    plt.plot(x, pdf, 'r-', lw=2, label='Normal Distribution')
    plt.title('Normal Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.show()

def plot_binomial_distribution(data, freq):
    n = max(data)
    p = np.mean(data) / n
    x = np.arange(0, n + 1)
    pmf = binom.pmf(x, n, p)
    plt.bar(x, pmf, alpha=0.7, color='b', label='Binomial Distribution')
    plt.title('Binomial Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()

def plot_poisson_distribution(data, freq):
    lam = np.mean(data)
    x = np.arange(0, max(data) + 1)
    pmf = poisson.pmf(x, lam)
    plt.bar(x, pmf, alpha=0.7, color='g', label='Poisson Distribution')
    plt.title('Poisson Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()

def plot_bernoulli_distribution(data, freq):
    success_prob = np.mean(data) / max(data)
    x = [0, 1]
    pmf = bernoulli.pmf(x, success_prob)
    plt.bar(x, pmf, alpha=0.7, color='purple', label='Bernoulli Distribution')
    plt.title('Bernoulli Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()

def analyze_distributions(data, freq):
    print("Analyzing Normal Distribution:")
    plot_normal_distribution(data, freq)
    print("Analyzing Binomial Distribution:")
    plot_binomial_distribution(data, freq)
    print("Analyzing Poisson Distribution:")
    plot_poisson_distribution(data, freq)
    print("Analyzing Bernoulli Distribution:")
    plot_bernoulli_distribution(data, freq)

data, freq = get_user_data()
analyze_distributions(data, freq)


Program 7: Hypothesis Testing with T-Tests

import numpy as np
import pandas as pd
from scipy import stats

exam_scores = np.array([85, 87, 90, 78, 88, 95, 82, 79, 94, 91])
group_A = np.array([85, 89, 88, 90, 93, 85, 84, 79, 90, 87])
group_B = np.array([82, 86, 85, 87, 92, 80, 81, 78, 89, 85])
before_treatment = np.array([82, 84, 88, 78, 80, 85, 90, 79, 87, 83])
after_treatment = np.array([85, 87, 89, 81, 83, 88, 92, 82, 89, 86])

def one_sample_ttest(data, population_mean):
    t_stat, p_value = stats.ttest_1samp(data, population_mean)
    return t_stat, p_value

def two_sample_ttest(group1, group2):
    t_stat, p_value = stats.ttest_ind(group1, group2)
    return t_stat, p_value

def paired_sample_ttest(before, after):
    t_stat, p_value = stats.ttest_rel(before, after)
    return t_stat, p_value

def analyze_ttest_results(t_stat, p_value, alpha=0.05):
    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")
    if p_value < alpha:
        print("Reject the null hypothesis (the difference is statistically significant).")
    else:
        print("Fail to reject the null hypothesis (no statistically significant difference).")
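The listing ends before the tests are invoked. A minimal usage sketch follows, reusing the arrays defined above; the hypothesized population mean of 85 for the one-sample test is an illustrative choice:

print("One-sample t-test:")
t_stat, p_value = one_sample_ttest(exam_scores, population_mean=85)
analyze_ttest_results(t_stat, p_value)

print("\nTwo-sample t-test:")
t_stat, p_value = two_sample_ttest(group_A, group_B)
analyze_ttest_results(t_stat, p_value)

print("\nPaired-sample t-test:")
t_stat, p_value = paired_sample_ttest(before_treatment, after_treatment)
analyze_ttest_results(t_stat, p_value)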

Program 8: One-Way ANOVA

import numpy as np
import pandas as pd
from scipy.stats import f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Function for One-way ANOVA
def one_way_anova(data, groups, response):
    """
    Perform one-way ANOVA.
    :param data: DataFrame containing the dataset
    :param groups: Column name for grouping variable
    :param response: Column name for response variable
    """
    grouped_data = [group[response].values for _, group in data.groupby(groups)]
    f_stat, p_value = f_oneway(*grouped_data)
    print("\nOne-way ANOVA Results:")
    print(f"F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("At least one group mean differs significantly (reject the null hypothesis).")
    else:
        print("No significant difference between group means (fail to reject the null hypothesis).")
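The listing does not show the function being called. A minimal usage sketch with illustrative data; the Method/Score column names and values below are assumptions for demonstration only:

# Illustrative data: three teaching methods and exam scores
scores = pd.DataFrame({
    "Method": ["A"] * 5 + ["B"] * 5 + ["C"] * 5,
    "Score": [85, 88, 90, 86, 87, 78, 80, 82, 79, 81, 92, 94, 91, 93, 95],
})
one_way_anova(scores, groups="Method", response="Score")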

Program 9: Correlation and Simple Linear Regression

# Import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Generate sample data (or load your dataset here)
np.random.seed(42)  # For reproducibility
x = np.random.rand(100) * 100  # Random values for x
y = 2.5 * x + np.random.normal(0, 25, 100)  # Linear relation with noise

# Convert data into a DataFrame
data = pd.DataFrame({'X': x, 'Y': y})

# Compute Correlation
pearson_corr = data.corr(method='pearson')  # Pearson Correlation
spearman_corr, _ = spearmanr(data['X'], data['Y'])  # Spearman Rank Correlation

# Linear Regression
X = data['X'].values.reshape(-1, 1)  # Reshape for sklearn
Y = data['Y'].values
model = LinearRegression()
model.fit(X, Y)
Y_pred = model.predict(X)
regression_coeff = model.coef_[0]  # Slope
regression_intercept = model.intercept_  # Intercept
mse = mean_squared_error(Y, Y_pred)

# Print statistical results
print("Pearson Correlation Coefficient Matrix:")
print(pearson_corr)
print("\nSpearman Rank Correlation Coefficient:", spearman_corr)
print("\nLinear Regression Equation: Y = {:.2f}X + {:.2f}".format(regression_coeff, regression_intercept))
print("Mean Squared Error (MSE):", mse)

# Plot X-Y scatter plot with regression line
plt.figure(figsize=(8, 6))
plt.scatter(data['X'], data['Y'], color='blue', label='Data Points')
plt.plot(data['X'], Y_pred, color='red', label='Regression Line')
plt.title('X-Y Scatter Plot with Regression Line')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()

# Plot heatmap of correlation matrix
plt.figure(figsize=(6, 5))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Heatmap of Correlation Matrix')
plt.show()


Program 10: Principal Component Analysis (PCA)

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the Wisconsin Breast Cancer dataset
data = load_breast_cancer()
X = data.data  # Features
y = data.target  # Target variable (0 = malignant, 1 = benign)
feature_names = data.feature_names
target_names = data.target_names

# Standardize the data (important for PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)

# Get explained variance ratio for each component
explained_variance_ratio = pca.explained_variance_ratio_

# Create a DataFrame for visualization
pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
pca_df['Target'] = y

# Plot the PCA results
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Target', palette='Set1', alpha=0.8)
plt.title('PCA of Wisconsin Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(target_names)
plt.grid()
plt.show()

# Plot explained variance ratio
plt.figure(figsize=(8, 5))
plt.bar(range(1, 3), explained_variance_ratio, tick_label=['PCA1', 'PCA2'], color='skyblue')
plt.title('Explained Variance Ratio of PCA Components')
plt.xlabel('Principal Components')
plt.ylabel('Variance Explained')
plt.show()

# Full PCA with all components for analysis
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Plot cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='b')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Variance Explained')
plt.grid()
plt.show()

# Print key insights
print("PCA Analysis of Wisconsin Breast Cancer Dataset")
print("-------------------------------------------------")
print(f"Explained Variance (PCA1): {explained_variance_ratio[0]:.4f}")
print(f"Explained Variance (PCA2): {explained_variance_ratio[1]:.4f}")
print("Cumulative Variance Explained by All Components:")
for i, cum_var in enumerate(cumulative_variance, start=1):
    print(f"  Component {i}: {cum_var:.4f}")
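As a follow-up, scikit-learn's PCA also accepts a fractional n_components, keeping the smallest number of components that reaches that share of variance. A short added sketch building on X_scaled above:

# Keep enough components to explain 95% of the variance
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(X_scaled)
print("Components needed for 95% of the variance:", pca_95.n_components_)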

Program 11: Linear Discriminant Analysis (LDA)

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset
data = load_iris()
X = data.data  # Features
y = data.target  # Target variable (0, 1, 2)
target_names = data.target_names  # Class names

# Standardize the data (LDA benefits from scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis(n_components=2)  # Reduce to 2 components for visualization
X_lda = lda.fit_transform(X_scaled, y)

# Create a DataFrame for LDA-transformed data
lda_df = pd.DataFrame(X_lda, columns=['LDA1', 'LDA2'])
lda_df['Target'] = y

# Plot the LDA results in 2D space
plt.figure(figsize=(8, 6))
sns.scatterplot(data=lda_df, x='LDA1', y='LDA2', hue='Target', palette='Set1', style='Target', s=100)
plt.title('LDA of Iris Dataset')
plt.xlabel('Linear Discriminant 1')
plt.ylabel('Linear Discriminant 2')
plt.legend(title='Class', labels=target_names)
plt.grid()
plt.show()

# Print key insights
print("Linear Discriminant Analysis (LDA) Results")
print("--------------------------------------------------")
print("Explained Variance Ratio by LDA Components:")
for i, ratio in enumerate(lda.explained_variance_ratio_, start=1):
    print(f"  LDA{i}: {ratio:.4f}")
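LDA is also a classifier in its own right. A brief added sketch, reusing X_scaled and y from above, shows the same estimator predicting classes on a held-out split:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the data and evaluate LDA as a classifier
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
print("Test accuracy:", accuracy_score(y_test, clf.predict(X_test)))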


Program 12: Multiple Linear Regression

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)  # Features
y = X['petal length (cm)']  # Predict 'petal length' as the dependent variable
X = X.drop(columns=['petal length (cm)'])  # Remove 'petal length' from independent variables

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Multiple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)  # Train the model

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print model performance metrics
print("Multiple Linear Regression Results")
print("----------------------------------")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_:.4f}")

# Visualize actual vs predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, linestyle='--')
plt.title('Actual vs Predicted Values (Test Set)')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.grid()
plt.show()

# Pairplot to explore relationships in the dataset
sns.pairplot(pd.DataFrame(data.data, columns=data.feature_names), diag_kind='kde')
plt.suptitle('Pairplot of Iris Dataset Features', y=1.02)
plt.show()
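If coefficient significance is also of interest, a short added sketch fits the same regression with statsmodels, whose summary reports standard errors and p-values. It reuses X_train and y_train from above:

import statsmodels.api as sm

# Add an intercept column and fit ordinary least squares
X_train_const = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train_const).fit()
print(ols_model.summary())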