import os

import matplotlib.pyplot as plt
import pandas as pd
def rank_temperature_columns(headers):
    """Rank column headers by how likely they are to hold temperature data.

    Each header is compared case-insensitively against a small keyword table.
    A header's score is the weight of the strongest keyword it contains, so a
    header containing "temperature" scores 10 even though it also contains
    "temp".

    Args:
        headers: Iterable of column labels (e.g. ``DataFrame.columns``).
            Non-string labels are converted with ``str()`` before matching.

    Returns:
        A list of ``(header, score)`` tuples for headers that matched at least
        one keyword, sorted by score in descending order. Headers with equal
        scores keep their original relative order. The list is empty when
        nothing matches.
    """
    keyword_scores = {
        'temperature': 10,  # Full word gets the highest score
        'temp': 5,  # Abbreviation gets a moderate score
    }

    ranked_columns = []
    for header in headers:
        # str() guards against integer or other non-string column labels.
        lower_header = str(header).lower()
        score = max(
            (
                keyword_score
                for keyword, keyword_score in keyword_scores.items()
                if keyword in lower_header
            ),
            default=0,
        )
        if score > 0:  # Only include columns with a relevant score
            ranked_columns.append((header, score))

    # Sort columns based on the scores in descending order (stable for ties)
    ranked_columns.sort(key=lambda item: item[1], reverse=True)
    return ranked_columns
def analyze_temperature_by_month_and_year(file_path, output_csv_path):
    """Compute monthly min/max/median temperature statistics and export them.

    Reads the CSV at ``file_path``, picks the most likely temperature column
    via ``rank_temperature_columns`` and the first column whose name contains
    "time", then writes two CSV files:

    1. ``output_csv_path``: one row per calendar month (``time`` is the first
       day of that month) with ``min_temperature``, ``max_temperature`` and
       ``median_temperature``.
    2. A year-by-month pivot: one row per year and one column per
       ``<statistic>_<month number>`` combination. Its path is
       ``output_csv_path`` with "_monthly_analysis" replaced by
       "_year_month_analysis"; if that marker is absent,
       "_year_month_analysis" is appended before the file extension so the
       first file is never overwritten.

    Args:
        file_path: Path of the input CSV file.
        output_csv_path: Destination path (string) of the monthly statistics.

    Returns:
        None. Problems are reported on stdout instead of being raised.
    """
    try:
        data = pd.read_csv(file_path)

        # Display headers for verification
        print("Available headers in the CSV file:")
        print(list(data.columns))

        # Rank potential temperature columns
        ranked_columns = rank_temperature_columns(data.columns)
        if not ranked_columns:
            print("No column likely refers to temperature. Please check your CSV file.")
            return

        # Select the most probable temperature column
        temperature_column = ranked_columns[0][0]
        print(f"Using '{temperature_column}' as the temperature column. (Ranking: {ranked_columns})")

        # Identify and handle the time-related column. Each column is tested
        # on its own: testing the concatenation of all names could match
        # "time" across a column boundary and then fail on the lookup.
        time_column_candidates = [col for col in data.columns if 'time' in str(col).lower()]
        if not time_column_candidates:
            print("No suitable 'time' column found. Please ensure a time-related column exists.")
            return
        time_column = time_column_candidates[0]
        data[time_column] = pd.to_datetime(data[time_column])

        # Extract year and month from the time column. This is robust to different day entries.
        data['year'] = data[time_column].dt.year
        data['month'] = data[time_column].dt.month

        # Create a 'time' column with day=1 and hour=0 to represent the month
        data['time'] = pd.to_datetime(dict(year=data['year'], month=data['month'], day=1, hour=0))

        # Group data by the new 'time' column and calculate statistics.
        # reset_index() keeps 'time' as a regular column so it survives
        # to_csv(index=False).
        monthly_analysis = data.groupby('time')[temperature_column].agg(
            min_temperature='min',
            max_temperature='max',
            median_temperature='median',
        ).reset_index()

        # Save results into a new CSV file
        monthly_analysis.to_csv(output_csv_path, index=False)
        print(f"Monthly temperature statistics saved to: {output_csv_path}")

        # Create a pivot table for further analysis: each column will have the
        # min, max, and median temperatures for each month.
        # reset_index() turns 'year' and 'month' back into columns, which
        # DataFrame.pivot needs for its index/columns arguments.
        pivot_table = data.groupby(['year', 'month'])[temperature_column].agg(
            min_temperature='min',
            max_temperature='max',
            median_temperature='median',
        ).reset_index()

        # Reshape the table so that months become columns with their stats split
        reshaped_table = pivot_table.pivot(
            index='year',
            columns='month',
            values=['min_temperature', 'max_temperature', 'median_temperature'],
        )

        # Flatten the (statistic, month) column pairs into "<statistic>_<month>" names
        reshaped_table.columns = [f"{stat}_{month}" for stat, month in reshaped_table.columns]
        reshaped_table.reset_index(inplace=True)

        # Save the pivot table into the second CSV file
        reshaped_csv_path = output_csv_path.replace("_monthly_analysis", "_year_month_analysis")
        if reshaped_csv_path == output_csv_path:
            # The marker was not present; derive a distinct name instead of
            # overwriting the monthly statistics written above.
            root, extension = os.path.splitext(output_csv_path)
            reshaped_csv_path = f"{root}_year_month_analysis{extension}"
        reshaped_table.to_csv(reshaped_csv_path, index=False)
        print(f"Pivot table saved to: {reshaped_csv_path}")
    except Exception as e:
        # Script-level boundary: report the failure rather than crash.
        print(f"An error occurred: {e}")
def plot_temperature_by_month(file_path):
    """Plot yearly trends of monthly temperature statistics.

    This function reads the CSV file (with potentially hourly entries), extracts the monthly aggregated data,
    and then creates faceted plots. For each temperature statistic (min, max, median), it creates a figure with 12 subplots,
    one for each month, where the x-axis is the year and the y-axis shows the temperature variable.

    Args:
        file_path: Path of the input CSV file.

    Returns:
        None. Problems are reported on stdout instead of being raised.
    """
    try:
        data = pd.read_csv(file_path)

        # Identify temperature column using the ranking function
        ranked_columns = rank_temperature_columns(data.columns)
        if not ranked_columns:
            print("No column likely refers to temperature. Please check your CSV file.")
            return
        temperature_column = ranked_columns[0][0]
        print(f"Using '{temperature_column}' as the temperature column for plotting. (Ranking: {ranked_columns})")

        # Identify the time column (str() guards against non-string labels)
        time_column_candidates = [col for col in data.columns if 'time' in str(col).lower()]
        if not time_column_candidates:
            print("No suitable 'time' column found for plotting.")
            return
        time_column = time_column_candidates[0]
        data[time_column] = pd.to_datetime(data[time_column])

        # Extract year and month from the time column. This handles entries with different day values correctly.
        data['year'] = data[time_column].dt.year
        data['month'] = data[time_column].dt.month

        # Group data by year and month and calculate statistics.
        # reset_index() exposes 'year' and 'month' as columns for the
        # filtering and plotting below.
        monthly_stats = data.groupby(['year', 'month'])[temperature_column].agg(
            min_temperature='min',
            max_temperature='max',
            median_temperature='median',
        ).reset_index()

        # Define the statistics to plot
        stats = ['min_temperature', 'max_temperature', 'median_temperature']

        # Create a faceted plot for each statistic (i.e., one figure per statistic with 12 subplots)
        for stat in stats:
            fig, axes = plt.subplots(3, 4, figsize=(15, 10), sharex=True, sharey=True)
            fig.suptitle(f"{stat.replace('_', ' ').title()} by Year for Each Month", fontsize=16)

            for month, ax in enumerate(axes.flatten(), start=1):
                subset = monthly_stats[monthly_stats['month'] == month]
                ax.plot(subset['year'], subset[stat], marker='o', linestyle='-')
                ax.set_title(f"Month: {month}")
                ax.set_ylabel(temperature_column)

            # Leave room at the top of the figure for the suptitle
            fig.tight_layout(rect=[0, 0.03, 1, 0.95])

        # Display all figures once they have been built
        plt.show()
    except Exception as e:
        # Script-level boundary: report the failure rather than crash.
        print(f"An error occurred while plotting: {e}")
# Input CSV read by both functions, and destination of the monthly statistics.
file_path = "~/BR_VICOSA_TEMPERATURE_MONTHLY_1940-2024.csv"
output_csv_path = "BR_VICOSA_TEMPERATURE_1940-2024_monthly_analysis.csv"

# Run the pipeline only when executed as a script, so importing this module
# does not read files or open plot windows.
if __name__ == "__main__":
    # Analyze and export monthly statistics
    analyze_temperature_by_month_and_year(file_path, output_csv_path)

    # Create faceted plots for temperature statistics by month
    plot_temperature_by_month(file_path)