Data Analysis Techniques in Python: Merging, Reshaping, and More

1) import pandas as pd
import numpy as np

# Create two sample DataFrames
sales_data_1 = pd.DataFrame({
    'OrderID': [1, 2, 3, 4],
    'Product': ['Laptop', 'Tablet', 'Smartphone', 'Headphones'],
    'Sales': [100, 200, 2000, 800]
})

sales_data_2 = pd.DataFrame({
    'OrderID': [3, 4, 5, 6],
    'Product': ['Headphones', 'Laptop', 'Smartwatch', 'Tablet'],
    'Sales': [500, 300, 200, 900]
})

# Display the DataFrames
print("Sales Data 1:\n", sales_data_1)
print("\nSales Data 2:\n", sales_data_2)

# Merge DataFrames based on 'OrderID' using an inner join
merged_data = pd.merge(sales_data_1, sales_data_2, on='OrderID', how='inner',
                       suffixes=('_left', '_right'))
print("\nMerged Data (Inner Join):\n", merged_data)
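
# For comparison (an optional extra step, not part of the original listing):
# an outer join keeps OrderIDs that appear in only one table and fills the
# missing side with NaN.
outer_data = pd.merge(sales_data_1, sales_data_2, on='OrderID', how='outer',
                      suffixes=('_left', '_right'))
print("\nMerged Data (Outer Join):\n", outer_data)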

# Concatenate the DataFrames vertically
combined_data = pd.concat([sales_data_1, sales_data_2], ignore_index=True)
print("\nCombined Data (Concatenated Vertically):\n", combined_data)

# 2. Reshaping Data with Melt
# Create a sample DataFrame for reshaping
reshaping_data = pd.DataFrame({
    'Month': ['Jan', 'Feb', 'Mar'],
    'Product_A': [100, 150, 130],
    'Product_B': [90, 80, 120]
})

print("\nReshaping Data (Original):\n", reshaping_data)

# Melt the DataFrame to reshape it from wide to long format
melted_data = pd.melt(reshaping_data, id_vars=['Month'], var_name='Product',
                      value_name='Sales')
print("\nMelted Data (Long Format):\n", melted_data)

# 3. Pivoting Data

# Create a sample DataFrame for pivoting
pivot_data = pd.DataFrame({
    'Month': ['Jan', 'Jan', 'Feb', 'Feb', 'Mar', 'Mar'],
    'Product': ['Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_A',
                'Product_B'],
    'Sales': [100, 90, 150, 80, 130, 120]
})

print("\nPivot Data (Original):\n", pivot_data)

# Pivot the DataFrame to reshape it back to wide format
pivoted_data = pivot_data.pivot(index='Month', columns='Product', values='Sales')
print("\nPivoted Data (Wide Format):\n", pivoted_data)

# 4. Handling Missing Data
# Introduce some missing values
pivoted_data.loc['Feb', 'Product_A'] = np.nan
pivoted_data.loc['Mar', 'Product_B'] = np.nan
print("\nPivoted Data with Missing Values:\n", pivoted_data)

# Fill missing values with the mean of each column
filled_data = pivoted_data.fillna(pivoted_data.mean())
print("\nFilled Data (Missing Values Handled):\n", filled_data)
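
# Hedged alternative (not in the original listing): filling with the column
# median is often preferred when the data contain outliers.
median_filled_data = pivoted_data.fillna(pivoted_data.median())
print("\nFilled Data (Column Median):\n", median_filled_data)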

# 5. Summary Statistics
print("\nSummary Statistics of Filled Data:\n", filled_data.describe())


2) # Sample text to work with
text = " Hello, World! Welcome to Python programming language. "

# 1. Strip leading and trailing spaces
clean_text = text.strip()
print(f"Original Text: '{text}'")
print(f"Text after stripping spaces: '{clean_text}'")

# 2. Convert the text to uppercase
upper_text = clean_text.upper()
print(f"\nText in uppercase: '{upper_text}'")

# 3. Convert the text to lowercase
lower_text = clean_text.lower()
print(f"\nText in lowercase: '{lower_text}'")

# 4. Count occurrences of a substring (e.g., "o")
count_o = clean_text.count("o")
print(f"\nNumber of occurrences of 'o': {count_o}")

# 5. Replace a word in the string
replaced_text = clean_text.replace("Python", "HTML")
print(f"\nText after replacing 'Python' with 'HTML': '{replaced_text}'")

# 6. Find the position of a word in the string
position_world = clean_text.find("World")
print(f"\nPosition of 'World' in the text: {position_world}")

# 7. Split the text into words (by default on whitespace)
words = clean_text.split()
print(f"\nList of words in the text: {words}")

# 8. Join the words back into a single string
joined_text = " ".join(words)
print(f"\nText after joining words: '{joined_text}'")

# 9. Check if the text starts with "Hello"
starts_with_hello = clean_text.startswith("Hello")
print(f"\nDoes the text start with 'Hello'? {starts_with_hello}")

# 10. Check if the text ends with a specific word (e.g., "programming.")
ends_with_programming = clean_text.endswith("programming.")
print(f"\nDoes the text end with 'programming.'? {ends_with_programming}")


import re  # needed for the regular-expression operations below

# Sample text
text = """
John's email is [warrnerjhon@gmail.com]. He said, "Python is awesome!!" It's a great
language.
Another email: [xyz@gmail.com].
"""

# 1. Remove special characters except for spaces and email-related characters.
# Using regex to keep only letters, digits, '@', '.', and whitespace
clean_text = re.sub(r"[^a-zA-Z0-9@\.\s]", "", text)
print("Text after removing special characters:")
print(clean_text)

# 2. Convert the text to lowercase
clean_text = clean_text.lower()
print("\nText after converting to lowercase:")
print(clean_text)

# 3. Replace multiple spaces with a single space
clean_text = re.sub(r"\s+", " ", clean_text)
print("\nText after replacing multiple spaces:")
print(clean_text)

# 4. Extract all words starting with a vowel (a, e, i, o, u)
vowel_words = re.findall(r"\b[aeiouAEIOU]\w+", clean_text)
print("\nWords starting with a vowel:")
print(vowel_words)

# 5. Replace email addresses with '[abc@gmail.com]'
masked_text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                     "[abc@gmail.com]", clean_text)
print("\nText after replacing emails:")
print(masked_text)
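
# Hedged companion step (not in the original listing): the same pattern can be
# used with re.findall to list the addresses before masking them.
emails = re.findall(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", clean_text)
print("\nEmail addresses found:", emails)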


3) import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# Create sample time series data
np.random.seed(42)
# Note: the original end date duplicated the start date; a one-year daily range is
# assumed here so that monthly grouping and forecasting have enough observations.
date_range = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
data = pd.DataFrame({
    "Date": date_range,
    "Value_A": np.random.normal(100, 10, len(date_range)),
    "Value_B": np.random.normal(200, 20, len(date_range)),
})
# Set Date as the index
data.set_index("Date", inplace=True)
# GroupBy Mechanics
def groupby_mechanics(data):
    print("\n--- GroupBy Mechanics ---")
    # Group data by month and calculate the mean
    grouped = data.resample('M').mean()
    print(grouped)
    return grouped

# Data Formats: Vector and Multivariate
def data_formats(data):
    print("\n--- Data Formats ---")
    # Display data as a vector (single series)
    print("\nVector Format:")
    print(data["Value_A"].head())
    # Display the multivariate time series
    print("\nMultivariate Time Series:")
    print(data.head())
# Forecasting Example
def time_series_forecasting(data):
    print("\n--- Forecasting ---")
    # Select a single column for forecasting
    ts = data["Value_A"]

    # Train-test split (first 80% for training, last 20% for testing)
    train = ts[:int(0.8 * len(ts))]
    test = ts[int(0.8 * len(ts)):]

    # Fit the Holt-Winters Exponential Smoothing model
    model = ExponentialSmoothing(train, seasonal="add", seasonal_periods=30).fit()

    # Forecast for the test period
    forecast = model.forecast(len(test))

    # Plot results
    plt.figure(figsize=(12, 6))
    plt.plot(train, label="Train")
    plt.plot(test, label="Test")
    plt.plot(forecast, label="Forecast")
    plt.legend()
    plt.title("Time Series Forecasting")
    plt.show()
# Main function
if __name__ == "__main__":
    print("--- Time Series Data ---")
    print(data.head())
    # Grouping Mechanics
    monthly_data = groupby_mechanics(data)
    # Data Formats
    data_formats(data)
    # Time Series Forecasting
    time_series_forecasting(data)

4)

import numpy as np
from scipy import stats

# Data and corresponding frequencies
data = [10,11,12,13,14]
frequency = [1,2,1,3,2]

# Expand data based on frequencies
expanded_data = np.repeat(data, frequency)

# Calculations
mean = np.mean(expanded_data)
median = np.median(expanded_data)
mode = stats.mode(expanded_data)
std_dev = np.std(expanded_data)
variance = np.var(expanded_data)
mean_deviation = np.mean(np.abs(expanded_data - mean))


q1 = np.percentile(expanded_data, 25)
q3 = np.percentile(expanded_data, 75)
quartile_deviation = (q3 - q1) / 2
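
# Quick manual check on the expanded data [10, 11, 11, 12, 13, 13, 13, 14, 14]
# (assuming NumPy's default linear interpolation for percentiles):
#   Q1 = 11 and Q3 = 13, so the quartile deviation is (13 - 11) / 2 = 1.0;
#   the mean is 111 / 9, about 12.33, and the mode is 13 (it occurs three times).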

# Results
print(f"Mean: {mean:.2f}")
print(f"Median: {median}")
print(f"Mode: {mode[0]}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Mean Deviation: {mean_deviation:.2f}")
print(f"Quartile Deviation: {quartile_deviation}")

7) import numpy as np
import pandas as pd
from scipy import stats
exam_scores = np.array([85, 87, 90, 78, 88, 95, 82, 79, 94, 91])
group_A = np.array([85, 89, 88, 90, 93, 85, 84, 79, 90, 87])
group_B = np.array([82, 86, 85, 87, 92, 80, 81, 78, 89, 85])
before_treatment = np.array([82, 84, 88, 78, 80, 85, 90, 79, 87, 83])
after_treatment = np.array([85, 87, 89, 81, 83, 88, 92, 82, 89, 86])
def one_sample_ttest(data, population_mean):
    t_stat, p_value = stats.ttest_1samp(data, population_mean)
    return t_stat, p_value

def two_sample_ttest(group1, group2):
    t_stat, p_value = stats.ttest_ind(group1, group2)
    return t_stat, p_value

def paired_sample_ttest(before, after):
    t_stat, p_value = stats.ttest_rel(before, after)
    return t_stat, p_value

def analyze_ttest_results(t_stat, p_value, alpha=0.05):
    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")
    if p_value < alpha:
        print("Result: The null hypothesis is rejected (statistically significant difference).")
    else:
        print("Result: The null hypothesis cannot be rejected (no statistically significant difference).")
print("One-Sample T-Test:")
t_stat, p_value = one_sample_ttest(exam_scores, 85)
analyze_ttest_results(t_stat, p_value)

print()
print("Two-Sample T-Test:")
t_stat, p_value = two_sample_ttest(group_A, group_B)
analyze_ttest_results(t_stat, p_value)

print()
print("Paired-Sample T-Test:")
t_stat, p_value = paired_sample_ttest(before_treatment, after_treatment)
analyze_ttest_results(t_stat, p_value)
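
# Hedged follow-up (not in the original listing): an effect-size estimate
# (Cohen's d for paired data), computed from the treatment differences.
diff = after_treatment - before_treatment
cohens_d = diff.mean() / diff.std(ddof=1)
print(f"\nCohen's d (paired): {cohens_d:.3f}")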

5) import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
# Function to calculate and display metrics
def display_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
    return rmse, mae, r2
# Validation Set Approach
def validation_set_approach(X, y):
    print("Validation Set Approach:")
    # Split the dataset into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    display_metrics(y_val, y_pred)  # Display metrics
# Leave-One-Out Cross-Validation (LOOCV) Approach
def loocv_approach(X, y):
    print("Leave-One-Out Cross-Validation (LOOCV):")
    loo = LeaveOneOut()
    y_true, y_pred = [], []
    # Loop through each sample using LOOCV
    # (note: this fits one model per observation, so it is slow on the full dataset)
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Initialize and train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        # Make the prediction for the single held-out sample
        y_pred.append(model.predict(X_test)[0])
        y_true.append(y_test.iloc[0])
    display_metrics(y_true, y_pred)  # Display metrics
# K-Fold Cross-Validation Approach
def kfold_approach(X, y, k=5):
    print(f"{k}-Fold Cross-Validation Approach:")
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    y_true, y_pred = [], []
    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Initialize and train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        # Make predictions on the test set
        y_pred.extend(model.predict(X_test))
        y_true.extend(y_test)
    display_metrics(y_true, y_pred)  # Display metrics
def main():  # Main function to run all approaches
    print("Cross-Validation for RMSE, MAE, and R²:\n")
    validation_set_approach(X, y)
    print("\n")
    loocv_approach(X, y)
    print("\n")
    kfold_approach(X, y, k=5)

if __name__ == "__main__":
    main()
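
    # Hedged shortcut (not in the original listing): scikit-learn's cross_val_score
    # can report a comparable 5-fold RMSE in two lines; the scoring name assumes a
    # reasonably recent scikit-learn release.
    from sklearn.model_selection import cross_val_score
    cv_rmse = -cross_val_score(LinearRegression(), X, y, cv=5,
                               scoring="neg_root_mean_squared_error")
    print(f"\ncross_val_score 5-fold RMSE (mean): {cv_rmse.mean():.4f}")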


6) import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, binom, poisson, bernoulli
def get_user_data():
    data_input = input("Enter the data values separated by commas (e.g., 10, 20, 30): ")
    frequencies_input = input("Enter the corresponding frequencies separated by commas (e.g., 2, 3, 4): ")
    data = list(map(int, data_input.split(',')))
    freq = list(map(int, frequencies_input.split(',')))
    return data, freq

def plot_normal_distribution(data, freq):
    mean = np.mean(data)
    std_dev = np.std(data)
    x = np.linspace(min(data), max(data), 100)
    pdf = norm.pdf(x, mean, std_dev)
    plt.plot(x, pdf, 'r-', lw=2, label='Normal Distribution')
    plt.title('Normal Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.show()

def plot_binomial_distribution(data, freq):
    n = max(data)
    p = np.mean(data) / n
    x = np.arange(0, n + 1)
    pmf = binom.pmf(x, n, p)
    plt.bar(x, pmf, alpha=0.7, color='b', label='Binomial Distribution')
    plt.title('Binomial Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()

def plot_poisson_distribution(data, freq):
    lam = np.mean(data)
    x = np.arange(0, max(data) + 1)
    pmf = poisson.pmf(x, lam)
    plt.bar(x, pmf, alpha=0.7, color='g', label='Poisson Distribution')
    plt.title('Poisson Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()


def plot_bernoulli_distribution(data, freq):
    success_prob = np.mean(data) / max(data)
    x = [0, 1]
    pmf = bernoulli.pmf(x, success_prob)
    plt.bar(x, pmf, alpha=0.7, color='purple', label='Bernoulli Distribution')
    plt.title('Bernoulli Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.show()

def analyze_distributions(data, freq):
    print("Analyzing Normal Distribution:")
    plot_normal_distribution(data, freq)
    print("Analyzing Binomial Distribution:")
    plot_binomial_distribution(data, freq)
    print("Analyzing Poisson Distribution:")
    plot_poisson_distribution(data, freq)
    print("Analyzing Bernoulli Distribution:")
    plot_bernoulli_distribution(data, freq)

data, freq = get_user_data()
analyze_distributions(data, freq)

8) import numpy as np
import pandas as pd
from scipy.stats import f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Function for One-way ANOVA
def one_way_anova(data, groups, response):
    """
    Perform one-way ANOVA.
    :param data: DataFrame containing the dataset
    :param groups: Column name for the grouping variable
    :param response: Column name for the response variable
    """
    grouped_data = [group[response].values for _, group in data.groupby(groups)]
    f_stat, p_value = f_oneway(*grouped_data)
    print("\nOne-way ANOVA Results:")
    print(f"F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Reject the null hypothesis: Significant difference among group means.")
    else:
        print("Fail to reject the null hypothesis: No significant difference among group means.")
# Function for Two-way ANOVA
def two_way_anova(data, response, factor1, factor2):
    """
    Perform two-way ANOVA.
    :param data: DataFrame containing the dataset
    :param response: Column name for the response variable
    :param factor1: Column name for the first factor
    :param factor2: Column name for the second factor
    """
    formula = f"{response} ~ C({factor1}) + C({factor2}) + C({factor1}):C({factor2})"
    model = ols(formula, data).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print("\nTwo-way ANOVA Results:")
    print(anova_table)
if __name__ == "__main__":
    np.random.seed(42)  # make the example data reproducible
    # Example dataset for One-way ANOVA
    # (size=10 is assumed here so that each of the three groups has ten scores;
    # the original listing was garbled at this point)
    data_one_way = pd.DataFrame({
        "Group": np.repeat(['A', 'B', 'C'], 10),
        "Score": np.concatenate([
            np.random.normal(loc=50, scale=5, size=10),
            np.random.normal(loc=55, scale=5, size=10),
            np.random.normal(loc=60, scale=5, size=10)
        ])
    })
    # Perform One-way ANOVA
    one_way_anova(data_one_way, groups="Group", response="Score")

    # Example dataset for Two-way ANOVA
    data_two_way = pd.DataFrame({
        "Factor1": np.repeat(['Low', 'Medium', 'High'], 6),
        "Factor2": np.tile(['Type1', 'Type2'], 9),
        "Response": np.concatenate([
            np.random.normal(loc=50, scale=5, size=6),
            np.random.normal(loc=55, scale=5, size=6),
            np.random.normal(loc=60, scale=5, size=6)
        ])
    })
    # Perform Two-way ANOVA
    two_way_anova(data_two_way, response="Response", factor1="Factor1", factor2="Factor2")
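
    # Hedged post-hoc step (not in the original listing): if the one-way ANOVA is
    # significant, Tukey's HSD shows which group pairs differ.
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    tukey = pairwise_tukeyhsd(data_one_way["Score"], data_one_way["Group"])
    print("\nTukey HSD (post-hoc for the one-way example):")
    print(tukey)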


9) # Import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Generate sample data (or load your dataset here)
np.random.seed(42)  # For reproducibility
x = np.random.rand(100) * 100  # Random values for x
y = 2.5 * x + np.random.normal(0, 25, 100)  # Linear relation with noise

# Convert data into a DataFrame
data = pd.DataFrame({'X': x, 'Y': y})

# Compute Correlation
pearson_corr = data.corr(method='pearson')  # Pearson Correlation
spearman_corr, _ = spearmanr(data['X'], data['Y'])  # Spearman Rank Correlation

# Linear Regression
X = data['X'].values.reshape(-1, 1)  # Reshape for sklearn
Y = data['Y'].values
model = LinearRegression()
model.fit(X, Y)
Y_pred = model.predict(X)
regression_coeff = model.coef_[0]
regression_intercept = model.intercept_
mse = mean_squared_error(Y, Y_pred)
# Print statistical results
print("Pearson Correlation Coefficient Matrix:")
print(pearson_corr)
print("\nSpearman Rank Correlation Coefficient:", spearman_corr)
print("\nLinear Regression Equation: Y = {:.2f}X + {:.2f}".format(regression_coeff,
                                                                  regression_intercept))
print("Mean Squared Error (MSE):", mse)
# Plot X-Y scatter plot with regression line
plt.figure(figsize=(8, 6))
plt.scatter(data['X'], data['Y'], color='blue', label='Data Points')
plt.plot(data['X'], Y_pred, color='red', label='Regression Line')
plt.title('X-Y Scatter Plot with Regression Line')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()

# Plot heatmap of correlation matrix
plt.figure(figsize=(6, 5))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Heatmap of Correlation Matrix')
plt.show()


10) # Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the Wisconsin Breast Cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
# Target variable (0 = malignant, 1 = benign)
feature_names = data.feature_names
target_names = data.target_names
# Standardize the data (important for PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA(n_components=2)
# Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)
# Get explained variance ratio for each component
explained_variance_ratio = pca.explained_variance_ratio_
# Create a DataFrame for visualization
pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
pca_df['Target'] = y
# Plot the PCA results
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Target', palette='Set1', alpha=0.8)
plt.title('PCA of Wisconsin Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(target_names)
plt.grid()
plt.show()
# Plot explained variance ratio
plt.figure(figsize=(8, 5))
plt.bar(range(1, 3), explained_variance_ratio, tick_label=['PCA1', 'PCA2'], color='skyblue')
plt.title('Explained Variance Ratio of PCA Components')
plt.xlabel('Principal Components')
plt.ylabel('Variance Explained')
plt.show()
# Full PCA with all components for analysis
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
# Plot cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o',
         linestyle='--', color='b')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Variance Explained')
plt.grid()
plt.show()
# Print key insights
print("PCA Analysis of Wisconsin Breast Cancer Dataset")
print("-------------------------------------------------")
print(f"Explained Variance (PCA1): {explained_variance_ratio[0]:.4f}")
print(f"Explained Variance (PCA2): {explained_variance_ratio[1]:.4f}")
print("Cumulative Variance Explained by All Components:")
for i, cum_var in enumerate(cumulative_variance, start=1):
    print(f"  Component {i}: {cum_var:.4f}")
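
# Hedged follow-up (not in the original listing): number of components needed to
# retain, for example, 95% of the total variance.
n_components_95 = int(np.argmax(cumulative_variance >= 0.95)) + 1
print(f"Components needed to explain 95% of the variance: {n_components_95}")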


11) # Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
# Load the Iris dataset
data = load_iris()
X = data.data
# Features
y = data.target
# Target variable (0, 1, 2)
target_names = data.target_names
# Standardize the data (LDA benefits from scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis(n_components=2)
# Reduce to 2 components for visualization
X_lda = lda.fit_transform(X_scaled, y)
# Create a DataFrame for LDA-transformed data
lda_df = pd.DataFrame(X_lda, columns=['LDA1', 'LDA2'])
lda_df['Target'] = y
# Plot the LDA results in 2D space
plt.figure(figsize=(8, 6))
sns.scatterplot(data=lda_df, x='LDA1', y='LDA2', hue='Target', palette='Set1',
                style='Target', s=100)
plt.title('LDA of Iris Dataset')
plt.xlabel('Linear Discriminant 1')
plt.ylabel('Linear Discriminant 2')
plt.legend(title='Class', labels=target_names)
plt.grid()
plt.show()
# Print key insights
print("Linear Discriminant Analysis (LDA) Results")
print("--------------------------------------------------")
print("Explained Variance Ratio by LDA Components:")
for i, ratio in enumerate(lda.explained_variance_ratio_, start=1):
    print(f"  LDA{i}: {ratio:.4f}")
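
# Hedged extra check (not in the original listing): LDA is also a classifier, so
# its accuracy on the scaled training data can be reported directly.
print(f"LDA classification accuracy on the training data: {lda.score(X_scaled, y):.4f}")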


12) # Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)  # Features
y = X['petal length (cm)']  # Let's predict 'petal length' as the dependent variable
X = X.drop(columns=['petal length (cm)'])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Multiple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)  # Train the model

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print model performance metrics
print("Multiple Linear Regression Results")
print("----------------------------------")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
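
# Hedged usage example (not in the original listing): predict petal length for one
# hypothetical flower; the measurements below are illustrative values only
# (sepal length, sepal width, petal width, in cm).
sample = pd.DataFrame([[5.1, 3.5, 0.2]], columns=X.columns)
print(f"\nPredicted petal length for {sample.iloc[0].tolist()}: {model.predict(sample)[0]:.2f} cm")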
# Visualize actual vs predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red',
         linewidth=2, linestyle='--')
plt.title('Actual vs Predicted Values (Test Set)')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.grid()
plt.show()
# Pairplot to explore relationships in the dataset
sns.pairplot(pd.DataFrame(data.data, columns=data.feature_names), diag_kind='kde')
plt.suptitle('Pairplot of Iris Dataset Features', y=1.02)
plt.show()