# Libraries
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

# Data
# Read the workbook, then expose the frame's index as a regular 'index' column
# so it can be selected together with other columns.
df = pd.read_excel('my_data.xlsx')
df = df.assign(index=df.index)
df.head()

# Isolate the last twenty scores and the twenty scores immediately before them
first_twenty = df['scores'].iloc[-40:-20]
last_twenty = df['scores'].iloc[-20:]

# Put both samples side by side on a shared 0..19 index, one column per sample
df_new = pd.DataFrame(
    {
        'first_twenty': first_twenty.values,
        'last_twenty': last_twenty.values,
    },
    index=range(20),
)

df_new.head(20)

# Average score of each sample
first_twenty_mean = first_twenty.mean()
last_twenty_mean = last_twenty.mean()
print(f'The average score of the first twenty samples is: {first_twenty_mean}')
print(f'The average score of the last twenty samples is: {last_twenty_mean}')

The average score of the first twenty samples is: 34.75
The average score of the last twenty samples is: 43.15

# Summary statistics of both samples, transposed so each sample is one row
df_new.describe().T

# Shared x positions (0..19) used to plot the two samples against each other
x = range(20)
print(x)

range(0, 20)

# Create the plots: both samples drawn as points against the shared positions x.
# The earlier sample is labelled 'first_twenty' so the legend agrees with the
# column name and with every other chart of this data.
plt.plot(x, first_twenty, 'o', label='first_twenty')
plt.plot(x, last_twenty, 'o', label='last_twenty')

# Labels and Titles
plt.xlabel('observation')
plt.ylabel('score')
plt.title('Scores')

# Legend
plt.legend()

# Show plot
plt.grid(True)
plt.show()

# Plotly interactive graph
import plotly.graph_objects as go

# One marker trace per sample, both plotted against the shared 0..19 index
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_new.index, y=df_new['first_twenty'], mode='markers', name="first_twenty"))
fig.add_trace(go.Scatter(x=df_new.index, y=df_new['last_twenty'], mode='markers', name="last_twenty"))
fig.update_layout(title='Point plot')
fig.show()

# Export method for embedding html into personal website
# Plotly produces the page as UTF-8 text, so the file is written with that
# encoding explicitly rather than with the platform's default encoding.
with open('hypot_point.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html(include_plotlyjs='cdn'))

# Line plot: both samples as connected series on one set of axes
plt.figure(figsize=(12, 6))
for column, marker in (('first_twenty', 'o'), ('last_twenty', 'x')):
    plt.plot(df_new[column], label=column, marker=marker)
plt.xlabel('observation')
plt.ylabel('score')
plt.title('Trend Comparison Over Time')
plt.grid(True)
plt.legend()
plt.show()

# Plotly interactive graph
# One line trace per sample, both plotted against the shared 0..19 index
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_new.index, y=df_new['first_twenty'], mode='lines', name="first_twenty"))
fig.add_trace(go.Scatter(x=df_new.index, y=df_new['last_twenty'], mode='lines', name="last_twenty"))
fig.update_layout(title='Line chart')
fig.show()

# Export method for embedding html into personal website
# Plotly produces the page as UTF-8 text, so the file is written with that
# encoding explicitly rather than with the platform's default encoding.
with open('hypot_line.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html(include_plotlyjs='cdn'))

# Box plot of df_new (its two sample columns) for a quick spread comparison
df_new.boxplot()

<Axes: >

# Plotly interactive graph
# One box per sample; boxpoints='all' also draws every individual observation
fig = go.Figure()
fig.add_trace(go.Box(y=df_new['first_twenty'], name="first_twenty", boxpoints='all'))
fig.add_trace(go.Box(y=df_new['last_twenty'], name="last_twenty", boxpoints='all'))
fig.update_layout(title='Boxplot')
fig.show()

# Export method for embedding html into personal website
# Plotly produces the page as UTF-8 text, so the file is written with that
# encoding explicitly rather than with the platform's default encoding.
with open('hypot_box.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html(include_plotlyjs='cdn'))

# Histogram for frequency distribution
# DataFrame.plot opens its own figure, so the size is handed to it directly.
# A separate plt.figure(figsize=...) call beforehand only leaves an empty,
# unused 12x6 figure behind while the histogram is drawn at the default size.
df_new.plot(kind='hist', alpha=0.7, bins=15, figsize=(12, 6))
plt.title('Frequency Distribution of Trends')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

<Figure size 1200x600 with 0 Axes>

# Plotly histogram

# One histogram trace per sample
fig = go.Figure()
fig.add_trace(go.Histogram(x=first_twenty, name="first_twenty"))
fig.add_trace(go.Histogram(x=last_twenty, name="last_twenty"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.update_layout(title='Histogram of Samples')
fig.show()

# Export method for embedding html into personal website
# Plotly produces the page as UTF-8 text, so the file is written with that
# encoding explicitly rather than with the platform's default encoding.
with open('hypot_histogram.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html(include_plotlyjs='cdn'))

# Row-by-row change from the earlier sample to the later one, and its average
differences = df_new['last_twenty'].sub(df_new['first_twenty'])
mean_difference = differences.mean()
print('The mean difference is: {}'.format(mean_difference))

The mean difference is: 8.4

## Control Chart

# Mean and standard deviation of each sample
before_mean, before_std = df_new['first_twenty'].mean(), df_new['first_twenty'].std()
after_mean, after_std = df_new['last_twenty'].mean(), df_new['last_twenty'].std()

# Start the control chart figure
plt.figure(figsize=(10, 6))

# Earlier sample at positions 0..19, with its mean (dashed) and the band one
# standard deviation either side of it (dotted)
plt.plot(df_new.index, df_new['first_twenty'], 'o-', label='first_twenty')
plt.axhline(y=before_mean, color='b', linestyle='--', label='Before Mean')
plt.axhline(y=before_mean + before_std, color='b', linestyle=':', label='Before + 1 Std Dev')
plt.axhline(y=before_mean - before_std, color='b', linestyle=':', label='Before - 1 Std Dev')

# Later sample shifted 20 positions right so it follows the earlier one
plt.plot(df_new.index + 20, df_new['last_twenty'], 's-', label='last_twenty')
plt.axhline(y=after_mean, color='orange', linestyle='--', label='After Mean')
plt.axhline(y=after_mean + after_std, color='orange', linestyle=':', label='After + 1 Std Dev')
plt.axhline(y=after_mean - after_std, color='orange', linestyle=':', label='After - 1 Std Dev')

# Title, axis labels and legend
plt.title('Control Chart (Before vs After)')
plt.xlabel('Sample')
plt.ylabel('Sample Value')
plt.legend()

# Widen the x-axis to hold both runs, with a small buffer on each side
plt.xlim(left=-2, right=df_new.index.max() + 22)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.grid(True)
plt.tight_layout()
plt.show()

## Control Chart with anomaly detection
from sklearn.ensemble import IsolationForest

# Anomaly detection
# contamination=0.10 sets the share of scores the forest is expected to flag.
# The forest is randomized, so the seed is fixed; without it the flagged
# points can change from one run to the next.
isolation_method = IsolationForest(n_estimators=100, contamination=0.10, random_state=42)
# Model fitting on the single 'scores' feature (built once, used for fit and predict)
scores_frame = df[['scores']]
isolation_method.fit(scores_frame)
df['anomaly_iso'] = isolation_method.predict(scores_frame)  # -1 marks an anomaly
a = df.loc[df['anomaly_iso'] == -1, ['index', 'scores']]  # Anomaly


# Calculate mean and standard deviation for both columns
# (recomputed here so this cell can run without the plain control chart cell)
before_mean = df_new['first_twenty'].mean()
before_std = df_new['first_twenty'].std()
after_mean = df_new['last_twenty'].mean()
after_std = df_new['last_twenty'].std()

# Create the control chart
plt.figure(figsize=(10, 6))

# Plot the before data
plt.plot(df_new.index, df_new['first_twenty'], 'o-', label='first_twenty')
plt.axhline(y=before_mean, color='b', linestyle='--', label='Before Mean')
plt.axhline(y=before_mean + before_std, color='b', linestyle=':', label='Before + 1 Std Dev')
plt.axhline(y=before_mean - before_std, color='b', linestyle=':', label='Before - 1 Std Dev')

# Plot the after data with a shift of 20 points in the x-axis
plt.plot(df_new.index + 20, df_new['last_twenty'], 's-', label='last_twenty')
plt.axhline(y=after_mean, color='orange', linestyle='--', label='After Mean')
plt.axhline(y=after_mean + after_std, color='orange', linestyle=':', label='After + 1 Std Dev')
plt.axhline(y=after_mean - after_std, color='orange', linestyle=':', label='After - 1 Std Dev')

# Plot anomalies as large translucent red markers over the flagged points
# NOTE(review): anomalies are placed at their df 'index' value, which lines up
# with the 0..39 positions used above only if df holds exactly those 40 rows
# on a default 0-based index - confirm against the input file.
plt.plot(a['index'], a['scores'], 'ro', label='Anomalies', markersize=15, alpha=0.2)

# Set labels and title
plt.xlabel('Sample')
plt.ylabel('Sample Value')
plt.title('Control Chart (Before vs After) with Anomalies')
plt.legend()

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust x-axis limits to accommodate both sets of data
plt.xlim(-2, df_new.index.max() + 22)  # Add a buffer on both sides

# Show the plot
plt.grid(True)
plt.tight_layout()
plt.show()

# Significance threshold, written as 1 minus 0.95; passed to p_value_reader
alpha = 1 - 0.95

# Create a function to read the p-value
def p_value_reader(p_value, alpha):
    """Print how a p-value compares with a significance threshold.

    Two lines are printed: the comparison itself, then the conclusion it
    supports about the null hypothesis. Nothing is returned.

    Args:
        p_value: p-value produced by a statistical test; must lie in [0, 1].
        alpha: Significance threshold to compare against; must lie in [0, 1].

    Raises:
        ValueError: If either argument lies outside [0, 1]. NaN is rejected
            too, because every comparison involving NaN is false.
    """

    # Raise errors
    if not (0 <= p_value <= 1):
        raise ValueError("p_value must be between 0 and 1")
    if not (0 <= alpha <= 1):
        raise ValueError("threshold must be between 0 and 1")

    # Show the threshold with up to six significant digits. Rounding to two
    # decimals reported small thresholds such as 0.001 as 0.0; this form keeps
    # them readable and still hides float noise (1 - 0.95 displays as 0.05).
    threshold = f'{alpha:g}'

    # Evaluate the p_value
    if p_value < alpha:
        print(f'p-value ({p_value}) is less than the threshold ({threshold}).')
        print("Evidence suggests rejecting the null hypothesis")
    else:
        print(f'p-value ({p_value}) is greater than or equal to the threshold ({threshold}).')
        print("Not enough evidence to reject the null hypothesis")

# Shapiro-Wilk test for normality on the first_twenty column
stat, p_value = st.shapiro(df_new.loc[:, 'first_twenty'])
print('The p-value is {}'.format(p_value))

p_value_reader(p_value=p_value, alpha=alpha)

The p-value is 0.5092968959512759
p-value (0.5092968959512759) is greater than or equal to the threshold (0.05).
Not enough evidence to reject the null hypothesis

# Shapiro-Wilk test for normality on the last_twenty column
stat, p_value = st.shapiro(df_new.loc[:, 'last_twenty'])
print('The p-value is {}'.format(p_value))

p_value_reader(p_value=p_value, alpha=alpha)

The p-value is 0.11828396201875863
p-value (0.11828396201875863) is greater than or equal to the threshold (0.05).
Not enough evidence to reject the null hypothesis

# Levene's test for equal variances between the two samples
# (center='median' is scipy's default, spelled out here for clarity)
levene_stat, pvalue = st.levene(first_twenty, last_twenty, center='median')
p_value_reader(p_value=pvalue, alpha=0.05)

p-value (0.9418139826323845) is greater than or equal to the threshold (0.05).
Not enough evidence to reject the null hypothesis

# Paired t-test: observations are matched row by row across the two columns
t_score, p_value = st.ttest_rel(
    df_new['first_twenty'],
    df_new['last_twenty'],
    alternative='two-sided',  # switch to 'less' or 'greater' for a one-sided hypothesis
)

print('T-score: {}'.format(t_score))
p_value_reader(p_value=p_value, alpha=0.05)

T-score: -3.8227572324275547
p-value (0.0011485324742790328) is less than the threshold (0.05).
Evidence suggests rejecting the null hypothesis

# Independent two-sample t-test assuming equal variances
# (equal_var=True is scipy's default, spelled out here for clarity)
t_score, p_value = st.ttest_ind(
    df_new['first_twenty'],
    df_new['last_twenty'],
    equal_var=True,
    alternative='two-sided',  # switch to 'less' or 'greater' for a one-sided hypothesis
)

print('T-score: {}'.format(t_score))
p_value_reader(p_value=p_value, alpha=0.05)

T-score: -4.033229685686957
p-value (0.0002558439327543572) is less than the threshold (0.05).
Evidence suggests rejecting the null hypothesis

# Independent two-sample t-test without assuming equal variances
t_score, p_value = st.ttest_ind(
    df_new['first_twenty'],
    df_new['last_twenty'],
    equal_var=False,  # the only difference from the equal-variance test
    alternative='two-sided',
)

print('T-score: {}'.format(t_score))
p_value_reader(p_value=p_value, alpha=0.05)

T-score: -4.033229685686958
p-value (0.0002562689392081225) is less than the threshold (0.05).
Evidence suggests rejecting the null hypothesis

	count	mean	std	min	25%	50%	75%	max
first_twenty	20.0	34.75	6.463460	25.0	29.75	34.5	38.25	50.0
last_twenty	20.0	43.15	6.706438	34.0	38.75	41.5	47.00	59.0

Hypothesis Test (2 Sample T-test)¶

Import data¶

Charting the data¶

Normality Test (Shapiro-Wilk Test)¶

Equal Variance Test (Levene's Test)¶

T-test¶

2 tailed paired T-test¶

2 tailed T-test (equal variance)¶

2 tailed T-test (unequal variance)¶

	first_twenty	last_twenty
0	30	49
1	29	44
2	35	35
3	29	34
4	29	41
5	38	47
6	38	38
7	34	45
8	25	36
9	39	42
10	40	42
11	36	41
12	33	39
13	30	41
14	26	59
15	35	40
16	41	49
17	46	47
18	32	57
19	50	37

	scores	index
0	30	0
1	29	1
2	35	2
3	29	3
4	29	4

	first_twenty	last_twenty
0	30	49
1	29	44
2	35	35
3	29	34
4	29	41
5	38	47
6	38	38
7	34	45
8	25	36
9	39	42
10	40	42
11	36	41
12	33	39
13	30	41
14	26	59
15	35	40
16	41	49
17	46	47
18	32	57
19	50	37

	first_twenty	last_twenty
0	30	49
1	29	44
2	35	35
3	29	34
4	29	41
5	38	47
6	38	38
7	34	45
8	25	36
9	39	42
10	40	42
11	36	41
12	33	39
13	30	41
14	26	59
15	35	40
16	41	49
17	46	47
18	32	57
19	50	37