Initial commit - SolSolAutomazione project

This commit is contained in:
dino
2026-02-10 10:53:06 +01:00
commit 78eeaf1364
11 changed files with 2261 additions and 0 deletions

View File

@@ -0,0 +1,340 @@
# Advanced Statistical Analysis for Pyranometer Data
# Implements research-backed statistical methods for solar irradiance analysis
# Load required libraries
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(tseries)
library(nortest)
library(DescTools)
library(moments)
#' Run the full statistical comparison of each test sensor against a reference
#'
#' For every test column the data is reduced to pairwise-complete rows and the
#' battery of helper analyses (descriptives, error metrics, distribution,
#' time-series, hypothesis tests, performance indicators) is applied.
#'
#' @param data Data frame with timestamp and sensor columns
#' @param reference_col Name of reference sensor column
#' @param test_cols Vector of test sensor column names
#' @param timestamp_col Name of timestamp column (default: "timestamp")
#' @return Named list (one entry per test sensor) of analysis results
calculate_advanced_metrics <- function(data, reference_col, test_cols, timestamp_col = "timestamp") {
  analyses <- list()
  for (sensor in test_cols) {
    cat("Analyzing sensor:", sensor, "vs reference:", reference_col, "\n")
    # Keep only rows where both the reference and this sensor are observed.
    keep <- complete.cases(data[[reference_col]], data[[sensor]])
    paired <- data[keep, ]
    if (nrow(paired) < 10) {
      warning(paste("Insufficient data for comparison:", sensor, "vs", reference_col))
      next
    }
    ref_values <- paired[[reference_col]]
    obs_values <- paired[[sensor]]
    analyses[[sensor]] <- list(
      basic_statistics = calculate_basic_statistics(obs_values, ref_values),
      error_metrics = calculate_error_metrics(obs_values, ref_values),
      distribution_analysis = analyze_distribution(obs_values, ref_values),
      time_series_analysis = analyze_time_series(paired, sensor, reference_col, timestamp_col),
      statistical_tests = perform_statistical_tests(obs_values, ref_values),
      performance_indicators = calculate_performance_indicators(obs_values, ref_values),
      sample_size = nrow(paired)
    )
  }
  analyses
}
#' Descriptive statistics for a test sensor and its reference
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings.
#' @return Named list of location, spread and shape statistics.
calculate_basic_statistics <- function(test, reference) {
  # Both quartiles come from a single quantile() call.
  quartiles <- quantile(test, c(0.25, 0.75), na.rm = TRUE)
  list(
    test_mean = mean(test, na.rm = TRUE),
    test_median = median(test, na.rm = TRUE),
    test_sd = sd(test, na.rm = TRUE),
    test_min = min(test, na.rm = TRUE),
    test_max = max(test, na.rm = TRUE),
    test_q1 = quartiles[1],
    test_q3 = quartiles[2],
    test_iqr = IQR(test, na.rm = TRUE),
    # Shape statistics from the moments package.
    test_skewness = moments::skewness(test, na.rm = TRUE),
    test_kurtosis = moments::kurtosis(test, na.rm = TRUE),
    reference_mean = mean(reference, na.rm = TRUE),
    reference_median = median(reference, na.rm = TRUE),
    reference_sd = sd(reference, na.rm = TRUE)
  )
}
#' Error metrics comparing a test sensor with the reference
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings (same length).
#' @return Named list of absolute, relative and distributional error metrics.
calculate_error_metrics <- function(test, reference) {
  errors <- test - reference
  # Common aggregates computed once and reused.
  bias <- mean(errors, na.rm = TRUE)
  mean_abs_error <- mean(abs(errors), na.rm = TRUE)
  rms_error <- sqrt(mean(errors^2, na.rm = TRUE))
  reference_mean <- mean(reference, na.rm = TRUE)
  reference_span <- max(reference, na.rm = TRUE) - min(reference, na.rm = TRUE)
  list(
    # Absolute errors
    mbe = bias,
    mae = mean_abs_error,
    rmse = rms_error,
    # Relative errors (% of mean reference irradiance)
    relative_mbe = (bias / reference_mean) * 100,
    relative_mae = (mean_abs_error / reference_mean) * 100,
    relative_rmse = (rms_error / reference_mean) * 100,
    # Percentage-based metrics
    mape = mean(abs(errors / reference), na.rm = TRUE) * 100,
    smape = 100 * mean(2 * abs(errors) / (abs(test) + abs(reference)), na.rm = TRUE),
    nrmse = rms_error / reference_span,
    # Theil's inequality coefficient
    theil_u = theils_u(test, reference),
    # Error distribution shape
    error_mean = bias,
    error_sd = sd(errors, na.rm = TRUE),
    error_skewness = moments::skewness(errors, na.rm = TRUE),
    error_kurtosis = moments::kurtosis(errors, na.rm = TRUE)
  )
}
#' Theil's U inequality statistic for measurement agreement
#'
#' U = RMSE(predicted, actual) / (RMS(actual) + RMS(predicted)); 0 indicates
#' perfect agreement, values near 1 indicate no agreement.
#'
#' @param actual Numeric vector of actual (reference) values.
#' @param predicted Numeric vector of predicted (test) values.
#' @return Theil's U, or NA_real_ when no complete pairs exist or when the
#'   denominator is zero (both series identically zero).
theils_u <- function(actual, predicted) {
  # Use only pairwise-complete observations so stray NAs do not poison the
  # statistic (previously any NA made the result NA).
  keep <- complete.cases(actual, predicted)
  actual <- actual[keep]
  predicted <- predicted[keep]
  n <- length(actual)
  if (n == 0) {
    return(NA_real_)
  }
  numerator <- sqrt(sum((predicted - actual)^2) / n)
  denominator <- sqrt(sum(actual^2) / n) + sqrt(sum(predicted^2) / n)
  # Guard the degenerate all-zero case instead of returning NaN from 0/0.
  if (denominator == 0) {
    return(NA_real_)
  }
  numerator / denominator
}
#' Distribution and association analysis for a test/reference sensor pair
#'
#' Fits the linear model test ~ reference once and reuses it for the slope,
#' intercept and fit statistics (the original refit the same model four times).
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings (same length).
#' @return Named list of correlations, regression coefficients and
#'   Shapiro-Wilk normality p-values.
analyze_distribution <- function(test, reference) {
  # Single model fit shared by all regression-derived quantities.
  fit <- lm(test ~ reference)
  fit_summary <- summary(fit)
  coefficients <- coef(fit)
  list(
    # Correlation analysis
    pearson_correlation = cor(test, reference, method = "pearson"),
    spearman_correlation = cor(test, reference, method = "spearman"),
    kendall_correlation = cor(test, reference, method = "kendall"),
    # Regression analysis
    regression_slope = coefficients[2],
    regression_intercept = coefficients[1],
    r_squared = fit_summary$r.squared,
    adjusted_r_squared = fit_summary$adj.r.squared,
    # Shapiro-Wilk requires 3..5000 observations; callers pass cleaned data.
    test_normality_p = shapiro.test(test)$p.value,
    reference_normality_p = shapiro.test(reference)$p.value,
    errors_normality_p = shapiro.test(test - reference)$p.value
  )
}
#' Time-series diagnostics for a paired sensor record
#'
#' Reports the mean sampling interval, lag-1 autocorrelations, the peak
#' cross-correlation (and its lag) and ADF stationarity p-values. The PACF
#' computations in the original were never used in the result and were removed.
#'
#' @param data Data frame containing the two sensor columns and a timestamp.
#' @param test_col,reference_col Column names of the two series.
#' @param timestamp_col Name of the timestamp column.
#' @return Named list of time-series summary statistics.
analyze_time_series <- function(data, test_col, reference_col, timestamp_col) {
  # Order chronologically so diffs and autocorrelations are meaningful.
  data <- data[order(data[[timestamp_col]]), ]
  test_ts <- data[[test_col]]
  reference_ts <- data[[reference_col]]
  # Mean sampling interval in seconds (difftime units made explicit).
  time_diffs <- diff(data[[timestamp_col]])
  avg_interval <- mean(as.numeric(time_diffs, units = "secs"))
  # Autocorrelation and cross-correlation (no plots).
  test_acf <- acf(test_ts, plot = FALSE)
  reference_acf <- acf(reference_ts, plot = FALSE)
  ccf_result <- ccf(test_ts, reference_ts, plot = FALSE)
  # Augmented Dickey-Fuller stationarity tests; NA when tseries is missing or
  # the series is too short/degenerate for the test.
  test_adf <- tryCatch(
    adf.test(test_ts)$p.value,
    error = function(e) NA
  )
  reference_adf <- tryCatch(
    adf.test(reference_ts)$p.value,
    error = function(e) NA
  )
  list(
    sampling_interval_seconds = avg_interval,
    test_autocorrelation_lag1 = test_acf$acf[2],
    reference_autocorrelation_lag1 = reference_acf$acf[2],
    cross_correlation_max = max(ccf_result$acf),
    cross_correlation_lag = ccf_result$lag[which.max(ccf_result$acf)],
    test_stationarity_p = test_adf,
    reference_stationarity_p = reference_adf
  )
}
#' Hypothesis tests comparing a test sensor with the reference
#'
#' Each test is wrapped so that a failure (missing package, degenerate input)
#' degrades to NA results instead of aborting the whole analysis.
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings.
#' @return Named list of test statistics and p-values.
perform_statistical_tests <- function(test, reference) {
  errors <- test - reference
  # Evaluate an expression, degrading to NA statistic/p-value on error.
  run_or_na <- function(expr) {
    tryCatch(expr, error = function(e) list(statistic = NA, p.value = NA))
  }
  ks_res <- run_or_na(ks.test(test, reference))      # distribution similarity
  mw_res <- run_or_na(wilcox.test(test, reference))  # non-parametric location shift
  var_res <- run_or_na(var.test(test, reference))    # variance equality (F-test)
  ad_res <- run_or_na(nortest::ad.test(errors))      # normality of the errors
  list(
    ks_statistic = ks_res$statistic,
    ks_p_value = ks_res$p.value,
    mw_statistic = mw_res$statistic,
    mw_p_value = mw_res$p.value,
    f_statistic = var_res$statistic,
    f_p_value = var_res$p.value,
    ad_statistic = ad_res$statistic,
    ad_p_value = ad_res$p.value
  )
}
#' Solar-specific performance indicators for a test sensor
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings.
#' @return Named list of performance ratios and variability measures.
calculate_performance_indicators <- function(test, reference) {
  test_mean <- mean(test, na.rm = TRUE)
  reference_mean <- mean(reference, na.rm = TRUE)
  list(
    # Ratio of mean test output to mean reference output.
    performance_ratio = test_mean / reference_mean,
    # Mean point-wise ratio (treats the reference as clear-sky irradiance).
    clear_sky_index = mean(test / reference, na.rm = TRUE),
    # Coefficient of variation of the test series.
    variability_index = sd(test, na.rm = TRUE) / test_mean,
    # 1 minus the error spread relative to the mean reference level.
    consistency_ratio = 1 - (sd(test - reference, na.rm = TRUE) / reference_mean)
  )
}
#' Flatten per-sensor analysis results into a one-row-per-sensor data frame
#'
#' @param analysis_results Named list produced by calculate_advanced_metrics().
#' @param output_file Optional CSV path; when given, the report is also
#'   written to disk.
#' @return Data frame with one summary row per sensor.
generate_statistical_report <- function(analysis_results, output_file = NULL) {
  sensor_names <- names(analysis_results)
  rows <- lapply(sensor_names, function(sensor_name) {
    res <- analysis_results[[sensor_name]]
    as.data.frame(list(
      sensor_name = sensor_name,
      sample_size = res$sample_size,
      # Headline error metrics
      mean_bias_error = res$error_metrics$mbe,
      root_mean_square_error = res$error_metrics$rmse,
      mean_absolute_error = res$error_metrics$mae,
      # Association / goodness of fit
      r_squared = res$distribution_analysis$r_squared,
      pearson_correlation = res$distribution_analysis$pearson_correlation,
      # Statistical significance
      ks_p_value = res$statistical_tests$ks_p_value,
      mw_p_value = res$statistical_tests$mw_p_value,
      # Performance indicators
      performance_ratio = res$performance_indicators$performance_ratio,
      variability_index = res$performance_indicators$variability_index
    ))
  })
  # Named rows keep the sensor names as row names, matching rbind on a
  # named list.
  report_df <- do.call(rbind, setNames(rows, sensor_names))
  if (!is.null(output_file)) {
    write.csv(report_df, output_file, row.names = FALSE)
    cat("Statistical report saved to:", output_file, "\n")
  }
  report_df
}
# Example usage and testing
# NOTE: wrapped in `if (FALSE)` so sourcing this file never runs the demo;
# execute the body interactively to exercise the full analysis pipeline.
if (FALSE) {
# Create sample data
set.seed(123)
n <- 1000
timestamp <- seq(as.POSIXct("2024-01-01 06:00:00"), by = "1 min", length.out = n)
# Simulate reference sensor (clear sky conditions with some noise)
reference <- 800 + 200 * sin(seq(0, 2*pi, length.out = n)) + rnorm(n, 0, 20)
# Simulate test sensors with different characteristics
sensor1 <- reference + rnorm(n, 5, 15) # Small bias, good precision
sensor2 <- reference * 1.02 + rnorm(n, 0, 30) # 2% overestimation, poorer precision
sensor3 <- reference + rnorm(n, -10, 25) # Negative bias
sample_data <- data.frame(
timestamp = timestamp,
reference_sensor = reference,
test_sensor_1 = sensor1,
test_sensor_2 = sensor2,
test_sensor_3 = sensor3
)
# Run analysis
results <- calculate_advanced_metrics(
data = sample_data,
reference_col = "reference_sensor",
test_cols = c("test_sensor_1", "test_sensor_2", "test_sensor_3")
)
# Generate report
# Writes a CSV summary to the working directory as a side effect.
report <- generate_statistical_report(results, "sensor_comparison_report.csv")
print("Advanced statistical analysis completed!")
print(report)
}

View File

@@ -0,0 +1,432 @@
# Uncertainty Calculation Engine for Pyranometer Data
# Implements ISO GUM (Guide to the Expression of Uncertainty in Measurement) standards
# and advanced uncertainty analysis methods for solar irradiance measurements
# Load required libraries
library(dplyr)
library(tidyr)
library(moments)
library(boot)
#' Run the full GUM-style uncertainty workflow for each test sensor
#'
#' For every test column the data is reduced to pairwise-complete rows, then
#' Type A/Type B components are quantified, combined, expanded and
#' cross-checked with a Monte Carlo simulation.
#'
#' @param data Data frame with sensor measurements
#' @param reference_col Name of reference sensor column
#' @param test_cols Vector of test sensor column names
#' @param sensor_specs List containing sensor specifications and calibration data
#' @param coverage_factor Coverage factor for expanded uncertainty (default: 2 for 95% confidence)
#' @return Named list (one entry per test sensor) of uncertainty analyses
calculate_measurement_uncertainty <- function(data, reference_col, test_cols,
                                              sensor_specs = NULL, coverage_factor = 2) {
  analyses <- list()
  for (sensor in test_cols) {
    cat("Calculating uncertainty for sensor:", sensor, "\n")
    # Pairwise-complete rows only.
    keep <- complete.cases(data[[reference_col]], data[[sensor]])
    paired <- data[keep, ]
    if (nrow(paired) < 10) {
      warning(paste("Insufficient data for uncertainty analysis:", sensor))
      next
    }
    ref_values <- paired[[reference_col]]
    obs_values <- paired[[sensor]]
    residual_errors <- obs_values - ref_values
    # GUM workflow: Type A, Type B, combined, expanded, budget, Monte Carlo.
    type_a <- calculate_type_a_uncertainty(residual_errors, obs_values, ref_values)
    type_b <- calculate_type_b_uncertainty(sensor, sensor_specs, mean(ref_values))
    combined <- calculate_combined_uncertainty(type_a, type_b)
    expanded <- calculate_expanded_uncertainty(combined, coverage_factor)
    budget <- create_uncertainty_budget(type_a, type_b, combined, expanded)
    mc <- perform_monte_carlo_simulation(obs_values, ref_values, sensor_specs, sensor)
    analyses[[sensor]] <- list(
      type_a_uncertainty = type_a,
      type_b_uncertainty = type_b,
      combined_uncertainty = combined,
      expanded_uncertainty = expanded,
      uncertainty_budget = budget,
      monte_carlo_simulation = mc,
      coverage_factor = coverage_factor,
      sample_size = nrow(paired)
    )
  }
  analyses
}
#' Type A (statistical) uncertainty components
#'
#' @param errors Numeric vector of measurement errors (test - reference).
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings.
#' @return Named list of Type A uncertainty estimates.
calculate_type_a_uncertainty <- function(errors, test, reference) {
  n <- length(errors)
  error_sd <- sd(errors, na.rm = TRUE)
  list(
    # Standard uncertainty from repeated measurements.
    standard_uncertainty = error_sd / sqrt(n),
    # Uncertainty of the mean of the test series.
    uncertainty_of_mean = sd(test, na.rm = TRUE) / sqrt(n),
    # Uncertainty components derived from the regression model.
    regression_uncertainty = calculate_regression_uncertainty(test, reference),
    # Standard uncertainty corrected for serial correlation.
    autocorrelation_uncertainty = calculate_autocorrelation_uncertainty(errors),
    standard_deviation = error_sd,
    standard_error = error_sd / sqrt(n),
    degrees_of_freedom = n - 1
  )
}
#' Type B (systematic) uncertainty components from sensor specifications
#'
#' @param sensor_name Name used to look up entries in sensor_specs.
#' @param sensor_specs Optional named list of per-sensor relative uncertainties.
#' @param mean_irradiance Mean irradiance (W/m²) used to convert relative
#'   uncertainties to absolute ones.
#' @return List with relative and absolute components plus their quadrature sum.
calculate_type_b_uncertainty <- function(sensor_name, sensor_specs, mean_irradiance) {
  # Conservative default relative (fractional) uncertainties used when no
  # datasheet values are supplied.
  defaults <- list(
    calibration_uncertainty = 0.02,   # 2% calibration uncertainty
    temperature_uncertainty = 0.005,  # 0.5% per degree C temperature effect
    cosine_uncertainty = 0.01,        # 1% cosine response uncertainty
    azimuth_uncertainty = 0.002,      # 0.2% azimuth uncertainty
    spectral_uncertainty = 0.015,     # 1.5% spectral mismatch
    linearity_uncertainty = 0.005,    # 0.5% non-linearity
    stability_uncertainty = 0.01,     # 1% long-term stability
    resolution_uncertainty = 0.001    # 0.1% resolution uncertainty
  )
  has_specs <- !is.null(sensor_specs) && sensor_name %in% names(sensor_specs)
  relative <- if (has_specs) {
    # Datasheet values override the defaults component-by-component.
    modifyList(defaults, sensor_specs[[sensor_name]])
  } else {
    defaults
  }
  # Convert relative uncertainties to absolute values in W/m².
  absolute <- lapply(relative, function(u) u * mean_irradiance)
  list(
    relative_uncertainties = relative,
    absolute_uncertainties = absolute,
    # Root-sum-of-squares of all absolute components.
    combined_uncertainty = sqrt(sum(sapply(absolute, function(x) x^2))),
    # Type B uncertainties conventionally carry infinite degrees of freedom.
    effective_degrees_of_freedom = Inf
  )
}
#' Regression-based uncertainty estimates for a test/reference pair
#'
#' @param test Numeric vector of test-sensor readings (response).
#' @param reference Numeric vector of reference readings (predictor).
#' @return List with the regression standard error, a single-observation
#'   prediction uncertainty and the residual standard deviation.
calculate_regression_uncertainty <- function(test, reference) {
  fit <- lm(test ~ reference)
  fit_sigma <- summary(fit)$sigma
  sample_size <- length(test)
  list(
    regression_standard_error = fit_sigma,
    # Prediction interval half-width factor for one new observation.
    prediction_uncertainty = fit_sigma * sqrt(1 + 1 / sample_size),
    residual_standard_error = sd(residuals(fit))
  )
}
#' Standard uncertainty of the mean corrected for lag-1 autocorrelation
#'
#' Positive autocorrelation reduces the effective number of independent
#' observations (Neff = n * (1 - rho) / (1 + rho)); the corrected uncertainty
#' uses Neff in place of n.
#'
#' @param errors Numeric vector of measurement errors (test - reference).
#' @return List with the lag-1 autocorrelation, the effective sample size and
#'   the corrected standard uncertainty.
calculate_autocorrelation_uncertainty <- function(errors) {
  n <- length(errors)
  # acf() errors on constant/degenerate input; fall back to NA rather than
  # aborting the whole uncertainty analysis.
  rho <- tryCatch(
    acf(errors, plot = FALSE, lag.max = 1, na.action = na.pass)$acf[2],
    error = function(e) NA_real_
  )
  if (is.na(rho)) {
    # No usable autocorrelation estimate: report the uncorrected uncertainty.
    effective_n <- n
  } else {
    effective_n <- n * (1 - rho) / (1 + rho)
    # Guard against rho -> 1, which would drive the effective sample size to
    # (or below) zero and make the corrected uncertainty undefined.
    effective_n <- max(effective_n, 1)
  }
  corrected_uncertainty <- sd(errors, na.rm = TRUE) / sqrt(effective_n)
  list(
    autocorrelation_coefficient = rho,
    effective_sample_size = effective_n,
    corrected_uncertainty = corrected_uncertainty
  )
}
#' Combine Type A and Type B standard uncertainties
#'
#' Combines in quadrature and derives the effective degrees of freedom via the
#' Welch-Satterthwaite formula (a Type B component with infinite df
#' contributes zero to the denominator).
#'
#' @param type_a List from calculate_type_a_uncertainty().
#' @param type_b List from calculate_type_b_uncertainty().
#' @return List with the combined uncertainty, effective df and the exact
#'   95% Student-t coverage factor.
calculate_combined_uncertainty <- function(type_a, type_b) {
  u_a <- type_a$standard_uncertainty
  u_b <- type_b$combined_uncertainty
  combined <- sqrt(u_a^2 + u_b^2)
  # Welch-Satterthwaite effective degrees of freedom.
  effective_df <- combined^4 /
    (u_a^4 / type_a$degrees_of_freedom + u_b^4 / type_b$effective_degrees_of_freedom)
  list(
    combined_standard_uncertainty = combined,
    effective_degrees_of_freedom = effective_df,
    coverage_factor_95 = qt(0.975, effective_df)
  )
}
#' Expanded uncertainty U = k * u_c
#'
#' @param combined_uncertainty List from calculate_combined_uncertainty().
#' @param coverage_factor Coverage factor k (default 2).
#' @return List with the expanded uncertainty, k and the nominal confidence
#'   level (NA for non-conventional k values).
calculate_expanded_uncertainty <- function(combined_uncertainty, coverage_factor = 2) {
  # Map the two conventional coverage factors to their normal-distribution
  # confidence levels; anything else is reported as NA.
  confidence <- if (coverage_factor == 2) {
    0.9545
  } else if (coverage_factor == 1.96) {
    0.95
  } else {
    NA
  }
  list(
    expanded_uncertainty = coverage_factor * combined_uncertainty$combined_standard_uncertainty,
    coverage_factor = coverage_factor,
    confidence_level = confidence
  )
}
#' Create detailed uncertainty budget
#'
#' Assembles the individual Type A and Type B components plus the combined and
#' expanded figures into a single nested list suitable for reporting. Purely
#' a restructuring step: no values are computed here.
#'
#' @param type_a List from calculate_type_a_uncertainty().
#' @param type_b List from calculate_type_b_uncertainty().
#' @param combined List from calculate_combined_uncertainty().
#' @param expanded List from calculate_expanded_uncertainty().
#' @return Nested list with type_a_components, type_b_components and a summary.
create_uncertainty_budget <- function(type_a, type_b, combined, expanded) {
budget <- list(
# Statistical (Type A) contributions, flattened to scalar values.
type_a_components = list(
standard_uncertainty = type_a$standard_uncertainty,
uncertainty_of_mean = type_a$uncertainty_of_mean,
regression_uncertainty = type_a$regression_uncertainty$regression_standard_error,
autocorrelation_uncertainty = type_a$autocorrelation_uncertainty$corrected_uncertainty
),
# Systematic (Type B) contributions in absolute units (W/m²).
type_b_components = type_b$absolute_uncertainties,
# Headline figures for the report.
summary = list(
combined_standard_uncertainty = combined$combined_standard_uncertainty,
expanded_uncertainty = expanded$expanded_uncertainty,
coverage_factor = expanded$coverage_factor,
effective_degrees_of_freedom = combined$effective_degrees_of_freedom
)
)
return(budget)
}
#' Monte Carlo propagation of the main uncertainty components
#'
#' Draws independent normal errors for calibration, temperature and cosine
#' response (relative to the mean reference irradiance) plus a random
#' measurement error, and summarizes the distribution of the total bias.
#' The original drew one scalar rnorm() per iteration inside a loop; the
#' draws are now vectorized (same distribution, far faster for 10k runs).
#'
#' @param test Numeric vector of test-sensor readings.
#' @param reference Numeric vector of reference readings.
#' @param sensor_specs Optional named list of per-sensor relative uncertainties.
#' @param sensor_name Name used to look up entries in sensor_specs.
#' @param n_simulations Number of Monte Carlo draws (default 10000).
#' @return List with the simulated bias mean, sd, 2.5/97.5% quantiles and
#'   the simulation size.
perform_monte_carlo_simulation <- function(test, reference, sensor_specs, sensor_name,
                                           n_simulations = 10000) {
  mean_reference <- mean(reference, na.rm = TRUE)
  # Sensor-specific relative uncertainties, with conservative defaults.
  specs <- if (!is.null(sensor_specs) && sensor_name %in% names(sensor_specs)) {
    sensor_specs[[sensor_name]]
  } else {
    list(
      calibration_uncertainty = 0.02,
      temperature_uncertainty = 0.005,
      cosine_uncertainty = 0.01
    )
  }
  measurement_sd <- sd(test - reference, na.rm = TRUE)
  # Vectorized draws: one rnorm() call per uncertainty component.
  simulated_biases <-
    rnorm(n_simulations, 0, specs$calibration_uncertainty * mean_reference) +
    rnorm(n_simulations, 0, specs$temperature_uncertainty * mean_reference) +
    rnorm(n_simulations, 0, specs$cosine_uncertainty * mean_reference) +
    rnorm(n_simulations, 0, measurement_sd)
  list(
    mean_bias = mean(simulated_biases),
    sd_bias = sd(simulated_biases),
    quantile_2.5 = quantile(simulated_biases, 0.025),
    quantile_97.5 = quantile(simulated_biases, 0.975),
    simulation_size = n_simulations
  )
}
#' Flatten per-sensor uncertainty results into a one-row-per-sensor data frame
#'
#' @param uncertainty_results Named list from calculate_measurement_uncertainty().
#' @param output_file Optional CSV path; when given, the report is also
#'   written to disk.
#' @return Data frame with one summary row per sensor.
generate_uncertainty_report <- function(uncertainty_results, output_file = NULL) {
  sensor_names <- names(uncertainty_results)
  rows <- lapply(sensor_names, function(sensor_name) {
    res <- uncertainty_results[[sensor_name]]
    as.data.frame(list(
      sensor_name = sensor_name,
      sample_size = res$sample_size,
      # Headline uncertainty figures
      combined_standard_uncertainty = res$combined_uncertainty$combined_standard_uncertainty,
      expanded_uncertainty = res$expanded_uncertainty$expanded_uncertainty,
      coverage_factor = res$expanded_uncertainty$coverage_factor,
      # Type A (statistical) components
      type_a_standard_uncertainty = res$type_a_uncertainty$standard_uncertainty,
      type_a_degrees_of_freedom = res$type_a_uncertainty$degrees_of_freedom,
      # Type B (systematic) component
      type_b_combined_uncertainty = res$type_b_uncertainty$combined_uncertainty,
      # Monte Carlo cross-check
      monte_carlo_mean_bias = res$monte_carlo_simulation$mean_bias,
      monte_carlo_uncertainty = res$monte_carlo_simulation$sd_bias
    ))
  })
  # Named rows keep the sensor names as row names.
  report_df <- do.call(rbind, setNames(rows, sensor_names))
  if (!is.null(output_file)) {
    write.csv(report_df, output_file, row.names = FALSE)
    cat("Uncertainty report saved to:", output_file, "\n")
  }
  report_df
}
#' Calculate measurement uncertainty following ISO GUM
#'
#' Implements the GUM procedure: quantify each component as a standard
#' uncertainty (Type A: value / sqrt(n); Type B: value for a stated normal
#' distribution, value / sqrt(3) for the default rectangular assumption),
#' combine in quadrature, then expand.
#'
#' Bug fix: `coverage_probability` was previously ignored and k was always 2.
#' The conventional k = 2 is kept for the default 95% case; other probabilities
#' use the normal quantile (appropriate for large effective df).
#'
#' @param measurement_result The measured value
#' @param uncertainty_components Named list; each entry has $type ("A"/"B"),
#'   $value, optionally $n (Type A) and $distribution (Type B).
#' @param coverage_probability Desired coverage probability (default: 0.95)
#' @return List containing the complete uncertainty analysis
calculate_iso_gum_uncertainty <- function(measurement_result, uncertainty_components,
                                          coverage_probability = 0.95) {
  # Step 1: Identify all uncertainty sources
  sources <- names(uncertainty_components)
  # Step 2: Quantify each component as a standard uncertainty
  standard_uncertainties <- sapply(uncertainty_components, function(comp) {
    if (comp$type == "A") {
      # Type A: standard error of the mean from repeated observations.
      comp$value / sqrt(comp$n)
    } else if (comp$type == "B") {
      # Type B: rectangular distribution assumed unless stated normal.
      if (!is.null(comp$distribution) && comp$distribution == "normal") {
        comp$value
      } else {
        comp$value / sqrt(3)
      }
    } else {
      # Unknown type: treat the value as an already-standard uncertainty.
      comp$value
    }
  })
  # Step 3: Combined standard uncertainty (root sum of squares)
  combined_uncertainty <- sqrt(sum(standard_uncertainties^2))
  # Step 4: Coverage factor derived from the requested probability.
  coverage_factor <- if (isTRUE(all.equal(coverage_probability, 0.95))) {
    2  # conventional GUM value for ~95% coverage
  } else {
    qnorm((1 + coverage_probability) / 2)
  }
  expanded_uncertainty <- coverage_factor * combined_uncertainty
  list(
    measurement_result = measurement_result,
    standard_uncertainties = standard_uncertainties,
    combined_standard_uncertainty = combined_uncertainty,
    expanded_uncertainty = expanded_uncertainty,
    coverage_factor = coverage_factor,
    coverage_probability = coverage_probability,
    uncertainty_budget = data.frame(
      source = sources,
      standard_uncertainty = standard_uncertainties
    )
  )
}
# Example usage and testing
# NOTE: wrapped in `if (FALSE)` so sourcing this file never runs the demo;
# execute the body interactively to exercise the uncertainty workflow.
if (FALSE) {
# Create sample data
set.seed(123)
n <- 500
timestamp <- seq(as.POSIXct("2024-01-01 06:00:00"), by = "1 min", length.out = n)
# Simulate reference and test sensors
reference <- 800 + 200 * sin(seq(0, 2*pi, length.out = n)) + rnorm(n, 0, 10)
test_sensor <- reference + rnorm(n, 5, 15) # 5 W/m² bias, 15 W/m² random error
sample_data <- data.frame(
timestamp = timestamp,
reference_sensor = reference,
test_sensor = test_sensor
)
# Define sensor specifications
sensor_specs <- list(
test_sensor = list(
calibration_uncertainty = 0.02, # 2%
temperature_uncertainty = 0.005, # 0.5%
cosine_uncertainty = 0.01, # 1%
spectral_uncertainty = 0.015 # 1.5%
)
)
# Calculate uncertainty
uncertainty_results <- calculate_measurement_uncertainty(
data = sample_data,
reference_col = "reference_sensor",
test_cols = "test_sensor",
sensor_specs = sensor_specs,
coverage_factor = 2
)
# Generate report
# Writes a CSV summary to the working directory as a side effect.
report <- generate_uncertainty_report(uncertainty_results, "uncertainty_report.csv")
print("Uncertainty analysis completed!")
print(report)
# Example of ISO GUM calculation
gum_components <- list(
calibration = list(type = "B", value = 16, distribution = "normal"), # 2% of 800 W/m²
temperature = list(type = "B", value = 4), # 0.5% of 800 W/m²
repeatability = list(type = "A", value = 15, n = 500)
)
gum_result <- calculate_iso_gum_uncertainty(805, gum_components)
print("ISO GUM uncertainty calculation:")
print(gum_result)
}

View File

@@ -0,0 +1,342 @@
#!/usr/bin/env python3
"""
Pyranometer Data Validator
Implements quality control and data cleaning for solar irradiance measurements
Based on BSRN quality control procedures and research standards
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional
import logging
from datetime import datetime, timedelta
logger = logging.getLogger(__name__)
class PyranometerDataValidator:
"""Validate and clean pyranometer data using scientific quality control procedures"""
def __init__(self):
# Physical limits based on BSRN standards and research
self.physical_limits = {
'irradiance': {
'min': 0, # W/m² (can be negative for some sensors due to thermal effects)
'max': 1500, # W/m² (theoretical maximum for clear sky conditions)
'extremely_rare_max': 1400 # W/m² (rare but possible in high altitude)
},
'temperature': {
'min': -50, # °C
'max': 80, # °C
'extremely_rare_min': -40,
'extremely_rare_max': 70
},
'rate_of_change': {
'irradiance': 100, # W/m² per minute (maximum plausible rate)
'temperature': 5 # °C per minute
}
}
# Sensor-specific characteristics
self.sensor_specs = {
'kipp_zonen': {
'response_time': 5, # seconds
'uncertainty': 2.0, # % at k=2
'cosine_error': 1.0 # %
},
'apogee': {
'response_time': 1, # seconds
'uncertainty': 5.0, # % at k=2
'cosine_error': 2.0 # %
},
'hukseflux': {
'response_time': 1, # seconds
'uncertainty': 2.5, # % at k=2
'cosine_error': 1.5 # %
},
'generic': {
'response_time': 10, # seconds
'uncertainty': 10.0, # % at k=2
'cosine_error': 5.0 # %
}
}
def apply_physical_limits_check(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame:
"""Apply BSRN physical limits quality control"""
df = data.copy()
for sensor in sensor_info['sensors']:
col = sensor['column_name']
data_type = sensor['data_type']
if data_type == 'irradiance':
# Physical possible limits
mask_physical = (df[col] < self.physical_limits['irradiance']['min']) | \
(df[col] > self.physical_limits['irradiance']['max'])
# Extremely rare limits
mask_extreme = df[col] > self.physical_limits['irradiance']['extremely_rare_max']
# Add quality flags
df[f'{col}_quality_flag'] = 0 # Default: good
df.loc[mask_extreme, f'{col}_quality_flag'] = 1 # Warning
df.loc[mask_physical, f'{col}_quality_flag'] = 2 # Error
logger.info(f"Physical limits check for {col}: "
f"{mask_physical.sum()} errors, {mask_extreme.sum()} warnings")
elif data_type == 'temperature':
mask_physical = (df[col] < self.physical_limits['temperature']['min']) | \
(df[col] > self.physical_limits['temperature']['max'])
mask_extreme = (df[col] < self.physical_limits['temperature']['extremely_rare_min']) | \
(df[col] > self.physical_limits['temperature']['extremely_rare_max'])
df[f'{col}_quality_flag'] = 0
df.loc[mask_extreme, f'{col}_quality_flag'] = 1
df.loc[mask_physical, f'{col}_quality_flag'] = 2
logger.info(f"Temperature limits check for {col}: "
f"{mask_physical.sum()} errors, {mask_extreme.sum()} warnings")
return df
def detect_outliers_iqr(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame:
"""Detect outliers using Interquartile Range method"""
df = data.copy()
for sensor in sensor_info['sensors']:
if sensor['data_type'] == 'irradiance':
col = sensor['column_name']
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
df[f'{col}_outlier'] = outliers
logger.info(f"IQR outlier detection for {col}: {outliers.sum()} outliers found")
return df
def check_rate_of_change(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame:
"""Check for unrealistic rate of change in measurements"""
df = data.copy()
# Ensure data is sorted by timestamp
df = df.sort_values('timestamp').reset_index(drop=True)
for sensor in sensor_info['sensors']:
col = sensor['column_name']
data_type = sensor['data_type']
if data_type == 'irradiance':
# Calculate rate of change (W/m² per minute)
time_diff = df['timestamp'].diff().dt.total_seconds() / 60 # minutes
value_diff = df[col].diff()
rate_of_change = value_diff / time_diff
# Replace infinite values
rate_of_change = rate_of_change.replace([np.inf, -np.inf], np.nan)
# Flag unrealistic rates
max_rate = self.physical_limits['rate_of_change']['irradiance']
rate_outliers = rate_of_change.abs() > max_rate
df[f'{col}_rate_flag'] = rate_outliers
logger.info(f"Rate of change check for {col}: {rate_outliers.sum()} flags")
elif data_type == 'temperature':
time_diff = df['timestamp'].diff().dt.total_seconds() / 60
value_diff = df[col].diff()
rate_of_change = value_diff / time_diff
rate_of_change = rate_of_change.replace([np.inf, -np.inf], np.nan)
max_rate = self.physical_limits['rate_of_change']['temperature']
rate_outliers = rate_of_change.abs() > max_rate
df[f'{col}_rate_flag'] = rate_outliers
logger.info(f"Temperature rate check for {col}: {rate_outliers.sum()} flags")
return df
def detect_missing_data(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame:
"""Detect missing or invalid data points"""
df = data.copy()
for sensor in sensor_info['sensors']:
col = sensor['column_name']
# Check for NaN values
nan_mask = df[col].isna()
# Check for constant values (potential sensor failure)
if len(df) > 10: # Need sufficient data
rolling_std = df[col].rolling(window=10, min_periods=1).std()
constant_mask = rolling_std == 0
else:
constant_mask = pd.Series(False, index=df.index)
df[f'{col}_missing_flag'] = nan_mask | constant_mask
logger.info(f"Missing data check for {col}: "
f"{nan_mask.sum()} NaN, {constant_mask.sum()} constant values")
return df
def apply_sun_position_checks(self, data: pd.DataFrame, location: Dict = None) -> pd.DataFrame:
"""Apply sun position-based quality checks (simplified version)"""
df = data.copy()
# Simplified: check for nighttime values that should be zero
# In a full implementation, you would calculate solar position
df['hour'] = df['timestamp'].dt.hour
# Assume nighttime is between 8 PM and 5 AM (simplified)
nighttime_mask = (df['hour'] >= 20) | (df['hour'] < 5)
# For irradiance sensors, nighttime values should be near zero
irradiance_cols = [sensor['column_name'] for sensor in
[s for s in sensor_info['sensors'] if s['data_type'] == 'irradiance']]
for col in irradiance_cols:
nighttime_high = nighttime_mask & (df[col] > 50) # More than 50 W/m² at night
df[f'{col}_night_flag'] = nighttime_high
logger.info(f"Nighttime check for {col}: {nighttime_high.sum()} flags")
return df
def clean_data(self, data: pd.DataFrame, sensor_info: Dict,
cleaning_strategy: str = 'conservative') -> pd.DataFrame:
"""Apply comprehensive data cleaning based on selected strategy"""
df = data.copy()
logger.info("Starting data cleaning process...")
# Apply all validation checks
df = self.apply_physical_limits_check(df, sensor_info)
df = self.detect_outliers_iqr(df, sensor_info)
df = self.check_rate_of_change(df, sensor_info)
df = self.detect_missing_data(df, sensor_info)
# Apply cleaning based on strategy
if cleaning_strategy == 'conservative':
# Only remove clear errors
for sensor in sensor_info['sensors']:
col = sensor['column_name']
error_mask = df[f'{col}_quality_flag'] == 2
df.loc[error_mask, col] = np.nan
logger.info(f"Conservative cleaning for {col}: {error_mask.sum()} points removed")
elif cleaning_strategy == 'moderate':
# Remove errors and some warnings
for sensor in sensor_info['sensors']:
col = sensor['column_name']
remove_mask = (df[f'{col}_quality_flag'] == 2) | \
(df[f'{col}_outlier']) | \
(df[f'{col}_rate_flag']) | \
(df[f'{col}_missing_flag'])
df.loc[remove_mask, col] = np.nan
logger.info(f"Moderate cleaning for {col}: {remove_mask.sum()} points removed")
elif cleaning_strategy == 'aggressive':
# Remove all flagged data
for sensor in sensor_info['sensors']:
col = sensor['column_name']
remove_mask = (df[f'{col}_quality_flag'] > 0) | \
(df[f'{col}_outlier']) | \
(df[f'{col}_rate_flag']) | \
(df[f'{col}_missing_flag'])
df.loc[remove_mask, col] = np.nan
logger.info(f"Aggressive cleaning for {col}: {remove_mask.sum()} points removed")
# Calculate data quality statistics
quality_stats = self.calculate_quality_statistics(df, sensor_info)
logger.info("Data cleaning completed")
logger.info(f"Quality statistics: {quality_stats}")
return df, quality_stats
def calculate_quality_statistics(self, data: pd.DataFrame, sensor_info: Dict) -> Dict:
    """Calculate comprehensive quality statistics for each sensor column.

    Args:
        data: DataFrame that may contain per-sensor flag columns
            ('<col>_quality_flag', '<col>_outlier', '<col>_rate_flag',
            '<col>_missing_flag') produced by the validation checks.
        sensor_info: Metadata dict with a 'sensors' list of
            {'column_name': ...} entries.

    Returns:
        Mapping of column name -> dict with flag counts and the derived
        data_quality_percentage / completeness_percentage.
    """
    quality_summary = {}
    for sensor in sensor_info['sensors']:
        col = sensor['column_name']
        total_points = len(data)
        if total_points == 0:
            continue

        # Quality flag convention: 0 = good, 1 = warning, 2 = error.
        quality_flag_col = f'{col}_quality_flag'
        if quality_flag_col in data.columns:
            error_count = (data[quality_flag_col] == 2).sum()
            warning_count = (data[quality_flag_col] == 1).sum()
            good_count = (data[quality_flag_col] == 0).sum()
        else:
            error_count = warning_count = 0
            good_count = total_points

        # When a flag column is absent, fall back to an all-False column of
        # the same length as the frame.  (The old fallback, pd.Series(False),
        # had length 1 regardless of the data, skewing the percentages.)
        all_false = pd.Series(False, index=data.index)
        outlier_count = data.get(f'{col}_outlier', all_false).sum()
        rate_flag_count = data.get(f'{col}_rate_flag', all_false).sum()
        missing_count = data.get(f'{col}_missing_flag', all_false).sum()

        quality_summary[col] = {
            'total_points': total_points,
            'good_points': good_count,
            'warning_points': warning_count,
            'error_points': error_count,
            'outlier_points': outlier_count,
            'rate_flag_points': rate_flag_count,
            'missing_points': missing_count,
            'data_quality_percentage': (good_count / total_points * 100) if total_points > 0 else 0,
            'completeness_percentage': ((total_points - missing_count) / total_points * 100) if total_points > 0 else 0
        }
    return quality_summary
# Example usage: build a synthetic half-day of 1-minute samples and clean it.
if __name__ == "__main__":
    timestamps = pd.date_range('2024-01-01 06:00:00', '2024-01-01 18:00:00', freq='1min')
    n_samples = len(timestamps)  # 721 points
    df = pd.DataFrame({
        'timestamp': timestamps,
        'sensor_1': np.random.normal(800, 50, n_samples),
        'sensor_2': np.random.normal(810, 60, n_samples),
        'temperature': np.random.normal(25, 2, n_samples),
    })

    # Inject a few values that violate the physical limits.
    df.loc[100, 'sensor_1'] = 2000    # Physical limit error
    df.loc[200, 'sensor_2'] = -100    # Physical limit error
    df.loc[300, 'temperature'] = 100  # Temperature error

    sensor_info = {
        'sensors': [
            {'column_name': name, 'data_type': kind}
            for name, kind in (
                ('sensor_1', 'irradiance'),
                ('sensor_2', 'irradiance'),
                ('temperature', 'temperature'),
            )
        ]
    }

    validator = PyranometerDataValidator()
    cleaned_data, quality_stats = validator.clean_data(df, sensor_info, 'moderate')

    print("Data validation completed!")
    print(f"Original data points: {len(df)}")
    print(f"Cleaned data points: {len(cleaned_data)}")
    print("Quality statistics:")
    for sensor, stats in quality_stats.items():
        print(f"  {sensor}: {stats['data_quality_percentage']:.1f}% good data")

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""
Pyranometer Data File Processor
Handles CSV and TXT files from various dataloggers and sensor types
"""
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime, timedelta
from pathlib import Path
import logging
from typing import Dict, List, Optional, Tuple
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PyranometerFileProcessor:
    """Process pyranometer data files from various formats and sensors.

    Supports .csv, .txt and .dat files; detects sensor types from column
    names and returns a uniform result dict per file.
    """

    def __init__(self, config_path: Optional[str] = None):
        # config_path is accepted for interface compatibility; it is not
        # read anywhere in this class yet.
        self.supported_formats = ['.csv', '.txt', '.dat']
        # Regex patterns used to recognize sensor models in column names.
        # Order matters: e.g. 'SP' appears in both kipp_zonen and apogee.
        self.sensor_patterns = {
            'kipp_zonen': r'(CMP|SMP|SP)[\d]+',
            'apogee': r'(SP|SQ)[\d]+',
            'hukseflux': r'(SR)[\d]+',
            'li_cor': r'(LI-200|LI-190)',
            'eppley': r'(PSP|EPPLEY)'
        }

    def detect_file_format(self, file_path: str) -> str:
        """Detect the format of the data file.

        Returns:
            One of 'csv', 'csv_txt', 'custom_txt' or 'unknown'.
        """
        file_ext = Path(file_path).suffix.lower()
        if file_ext == '.csv':
            return 'csv'
        elif file_ext in ('.txt', '.dat'):
            # .dat is listed in supported_formats; sniff its content like a
            # .txt file.  (Previously .dat fell through to 'unknown', so
            # batch processing of .dat files always raised.)
            with open(file_path, 'r') as f:
                first_line = f.readline().strip()
            return 'csv_txt' if ',' in first_line else 'custom_txt'
        else:
            return 'unknown'

    def parse_timestamp(self, timestamp_str: str, file_format: str) -> datetime:
        """Parse a timestamp string, trying several common formats.

        NOTE(review): on failure this returns datetime.now() instead of
        raising, which silently corrupts the time axis — confirm whether
        callers rely on this before changing it.
        """
        timestamp_formats = [
            '%Y-%m-%d %H:%M:%S',
            '%d/%m/%Y %H:%M:%S',
            '%m/%d/%Y %H:%M:%S',
            '%Y-%m-%dT%H:%M:%S',
            '%Y%m%d_%H%M%S',
            '%H:%M:%S %d/%m/%Y'
        ]
        for fmt in timestamp_formats:
            try:
                return datetime.strptime(timestamp_str.strip(), fmt)
            except ValueError:
                continue
        logger.warning(f"Could not parse timestamp: {timestamp_str}")
        return datetime.now()

    def detect_sensor_type(self, column_name: str) -> Tuple[str, str]:
        """Detect sensor type and model from a column name.

        Returns:
            (sensor_type, model); ('unknown', 'unknown') if nothing matches.
        """
        column_lower = column_name.lower()
        # Manufacturer-specific model patterns take precedence.
        for sensor_type, pattern in self.sensor_patterns.items():
            match = re.search(pattern, column_name, re.IGNORECASE)
            if match:
                return sensor_type, match.group()
        # Generic keyword fallbacks.
        if any(word in column_lower for word in ['irradiance', 'ghi', 'global']):
            return 'generic', 'irradiance_sensor'
        elif any(word in column_lower for word in ['temp', 'temperature']):
            return 'generic', 'temperature_sensor'
        elif any(word in column_lower for word in ['volt', 'voltage']):
            return 'generic', 'voltage_sensor'
        return 'unknown', 'unknown'

    def process_csv_file(self, file_path: str) -> Dict:
        """Process a CSV file with pyranometer data.

        Returns:
            Dict with file metadata, detected sensors, the parsed DataFrame
            ('data'), time range and average sampling interval.

        Raises:
            ValueError: If the file cannot be decoded with any known
                encoding or no timestamp column is found.
        """
        try:
            # Try a few common encodings before giving up.
            encodings = ['utf-8', 'latin-1', 'windows-1252']
            df = None
            for encoding in encodings:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                raise ValueError("Could not read file with any encoding")

            # Normalize column names: lower-case, underscores for spaces.
            df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

            # Identify the timestamp column by keyword.
            timestamp_cols = [col for col in df.columns if any(word in col for word in
                             ['time', 'date', 'timestamp', 'datetime'])]
            if not timestamp_cols:
                raise ValueError("No timestamp column found")
            timestamp_col = timestamp_cols[0]

            df['timestamp'] = df[timestamp_col].apply(
                lambda x: self.parse_timestamp(str(x), 'csv')
            )

            # Classify data columns by keyword.  A column joins only the
            # first category that matches, so it is never counted twice.
            irradiance_cols = [col for col in df.columns if any(word in col for word in
                              ['irradiance', 'ghi', 'global', 'w/m2', 'w_m2'])]
            temperature_cols = [col for col in df.columns
                                if col not in irradiance_cols
                                and any(word in col for word in
                                        ['temp', 'temperature', '°c', 'deg'])]
            # The bare keyword 'v' was dropped: it matched almost any column
            # name (e.g. 'avg', 'device'), misclassifying them as voltage.
            voltage_cols = [col for col in df.columns
                            if col not in irradiance_cols
                            and col not in temperature_cols
                            and any(word in col for word in ['volt', 'voltage', 'mv'])]

            # Build one sensor record per classified column.
            sensors = []
            for cols, data_type, units in (
                (irradiance_cols, 'irradiance', 'W/m²'),
                (temperature_cols, 'temperature', '°C'),
                (voltage_cols, 'voltage', 'V'),
            ):
                for col in cols:
                    sensor_type, model = self.detect_sensor_type(col)
                    sensors.append({
                        'column_name': col,
                        'sensor_type': sensor_type,
                        'model': model,
                        'data_type': data_type,
                        'units': units
                    })

            result = {
                'file_path': file_path,
                'file_format': 'csv',
                'timestamp_column': timestamp_col,
                'sensors': sensors,
                'data': df,
                'start_time': df['timestamp'].min(),
                'end_time': df['timestamp'].max(),
                'sampling_interval': self.calculate_sampling_interval(df['timestamp']),
                'total_measurements': len(df)
            }
            logger.info(f"Processed CSV file: {file_path}")
            logger.info(f"Found {len(sensors)} sensors, {len(df)} measurements")
            return result
        except Exception as e:
            logger.error(f"Error processing CSV file {file_path}: {str(e)}")
            raise

    def process_txt_file(self, file_path: str) -> Dict:
        """Process a TXT file (whitespace-separated) with pyranometer data.

        Delegates to process_csv_file() when the content is comma separated.
        """
        try:
            # Only the first line is needed to sniff the separator
            # (previously the whole file was read into memory for this).
            with open(file_path, 'r') as f:
                first_line = f.readline()
            if ',' in first_line:
                return self.process_csv_file(file_path)

            # Custom TXT format: space or tab separated.  This is a
            # simplified reader — customize for specific logger formats.
            df = pd.read_csv(file_path, sep=r'\s+', engine='python')
            data_cols = [f'col_{i}' for i in range(len(df.columns))]
            df.columns = data_cols

            # Assume the first column is the timestamp, the rest are data.
            timestamp_col = 'col_0'
            df['timestamp'] = df[timestamp_col].apply(
                lambda x: self.parse_timestamp(str(x), 'txt')
            )

            # Iterate only the original data columns.  (Iterating
            # df.columns[1:] after the append also emitted a bogus sensor
            # entry for the new 'timestamp' column.)
            sensors = []
            for i, col in enumerate(data_cols[1:], 1):
                sensors.append({
                    'column_name': col,
                    'sensor_type': 'unknown',
                    'model': f'sensor_{i}',
                    'data_type': 'irradiance',  # Assume irradiance by default
                    'units': 'W/m²'
                })

            result = {
                'file_path': file_path,
                'file_format': 'txt',
                'timestamp_column': timestamp_col,
                'sensors': sensors,
                'data': df,
                'start_time': df['timestamp'].min(),
                'end_time': df['timestamp'].max(),
                'sampling_interval': self.calculate_sampling_interval(df['timestamp']),
                'total_measurements': len(df)
            }
            logger.info(f"Processed TXT file: {file_path}")
            logger.info(f"Found {len(sensors)} sensors, {len(df)} measurements")
            return result
        except Exception as e:
            logger.error(f"Error processing TXT file {file_path}: {str(e)}")
            raise

    def calculate_sampling_interval(self, timestamps: pd.Series) -> float:
        """Calculate the average sampling interval in seconds.

        Returns 0 when fewer than two timestamps are available.
        """
        if len(timestamps) < 2:
            return 0
        # Vectorized difference instead of a Python-level loop.
        diffs = pd.to_datetime(timestamps).diff().dropna().dt.total_seconds()
        return float(diffs.mean()) if not diffs.empty else 0

    def process_file(self, file_path: str) -> Dict:
        """Process any supported file, dispatching on detected format.

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the format is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        file_format = self.detect_file_format(file_path)
        if file_format in ['csv', 'csv_txt']:
            return self.process_csv_file(file_path)
        elif file_format == 'custom_txt':
            return self.process_txt_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")

    def batch_process_files(self, directory_path: str) -> List[Dict]:
        """Process all supported files in a directory.

        Files that fail to parse are logged and skipped.  Sorted for a
        deterministic processing order (Path.glob order is filesystem
        dependent).
        """
        processed_files = []
        for file_path in sorted(Path(directory_path).glob('*')):
            if file_path.suffix.lower() in self.supported_formats:
                try:
                    processed_files.append(self.process_file(str(file_path)))
                except Exception as e:
                    logger.error(f"Failed to process {file_path}: {str(e)}")
                    continue
        return processed_files
# Example usage: write a tiny sample CSV and run it through the processor.
if __name__ == "__main__":
    processor = PyranometerFileProcessor()

    sample_rows = {
        'timestamp': ['2024-01-01 10:00:00', '2024-01-01 10:01:00', '2024-01-01 10:02:00'],
        'kipp_zonen_cmp11': [850.5, 852.1, 851.8],
        'apogee_sp110': [848.2, 849.5, 850.1],
        'reference_sensor': [851.0, 852.5, 852.0],
        'temperature': [25.3, 25.4, 25.5]
    }
    pd.DataFrame(sample_rows).to_csv('sample_pyranometer_data.csv', index=False)

    # Process the sample file and report what was found.
    try:
        result = processor.process_file('sample_pyranometer_data.csv')
        print("File processed successfully!")
        print(f"Sensors found: {len(result['sensors'])}")
        print(f"Time range: {result['start_time']} to {result['end_time']}")
        print(f"Sampling interval: {result['sampling_interval']} seconds")
    except Exception as e:
        print(f"Error: {e}")

196
src/database/schema.sql Normal file
View File

@@ -0,0 +1,196 @@
-- Pyranometer Data Analysis System Database Schema
-- Supports multiple sensor types, calibration sessions, and advanced analysis
-- Sensor metadata table
-- One row per physical instrument; uncertainty/sensitivity fields come from
-- the calibration certificate.
CREATE TABLE IF NOT EXISTS sensors (
    sensor_id TEXT PRIMARY KEY,
    manufacturer TEXT NOT NULL,
    model TEXT NOT NULL,
    serial_number TEXT UNIQUE,
    sensor_type TEXT CHECK(sensor_type IN ('thermopile', 'photodiode', 'reference', 'test')),
    classification TEXT CHECK(classification IN ('A', 'B', 'C', 'Secondary', 'Primary')),
    calibration_date DATE,
    calibration_certificate TEXT,
    uncertainty_value REAL, -- Expanded uncertainty at k=2
    uncertainty_units TEXT DEFAULT 'W/m²',
    sensitivity REAL, -- Sensor sensitivity in μV/W/m²
    response_time REAL, -- Response time in seconds
    spectral_range_min REAL, -- nm
    spectral_range_max REAL, -- nm
    temperature_coefficient REAL,
    cosine_correction_factor REAL,
    azimuth_correction_factor REAL,
    zero_offset_a REAL,
    zero_offset_b REAL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -- maintained by trigger
);
-- Calibration sessions table
-- One row per measurement campaign; environmental conditions recorded for
-- the uncertainty budget.
CREATE TABLE IF NOT EXISTS calibration_sessions (
    session_id TEXT PRIMARY KEY,
    session_date DATE NOT NULL,
    location TEXT,
    operator TEXT,
    reference_sensor_id TEXT REFERENCES sensors(sensor_id),
    weather_conditions TEXT,
    temperature REAL,
    humidity REAL,
    atmospheric_pressure REAL,
    notes TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Raw measurement data table
-- Time series of individual readings; quality_flag mirrors the Python
-- validator's convention.
CREATE TABLE IF NOT EXISTS measurements (
    measurement_id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT REFERENCES calibration_sessions(session_id),
    timestamp TIMESTAMP NOT NULL,
    sensor_id TEXT REFERENCES sensors(sensor_id),
    irradiance REAL NOT NULL, -- W/m²
    temperature REAL, -- °C
    voltage REAL, -- mV or V
    quality_flag INTEGER DEFAULT 0, -- 0=good, 1=warning, 2=error
    outlier_flag BOOLEAN DEFAULT FALSE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Statistical analysis results table
-- Aggregated per-sensor statistics for a session and analysis period.
CREATE TABLE IF NOT EXISTS statistical_results (
    result_id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT REFERENCES calibration_sessions(session_id),
    sensor_id TEXT REFERENCES sensors(sensor_id),
    analysis_type TEXT NOT NULL,
    period_start TIMESTAMP,
    period_end TIMESTAMP,
    -- Basic statistics
    n_observations INTEGER,
    mean_value REAL,
    median_value REAL,
    std_deviation REAL,
    min_value REAL,
    max_value REAL,
    q1_value REAL,
    q3_value REAL,
    -- Advanced metrics
    mbe REAL, -- Mean Bias Error
    rmse REAL, -- Root Mean Square Error
    mae REAL, -- Mean Absolute Error
    r_squared REAL,
    correlation_coefficient REAL,
    theil_u REAL,
    -- Uncertainty metrics
    type_a_uncertainty REAL,
    type_b_uncertainty REAL,
    combined_uncertainty REAL,
    expanded_uncertainty REAL,
    coverage_factor REAL DEFAULT 2.0,
    -- Performance indicators
    performance_ratio REAL,
    clear_sky_index REAL,
    variability_index REAL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Sensor comparison results table
-- Pairwise test-vs-reference metrics per session; feeds the
-- sensor_performance_summary view.
CREATE TABLE IF NOT EXISTS comparison_results (
    comparison_id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT REFERENCES calibration_sessions(session_id),
    test_sensor_id TEXT REFERENCES sensors(sensor_id),
    reference_sensor_id TEXT REFERENCES sensors(sensor_id),
    -- Comparison metrics
    bias REAL,
    relative_bias REAL,
    mbe REAL,
    relative_mbe REAL,
    rmse REAL,
    relative_rmse REAL,
    mae REAL,
    relative_mae REAL,
    r_squared REAL,
    slope REAL,
    intercept REAL,
    -- Statistical tests
    ks_statistic REAL, -- Kolmogorov-Smirnov
    ks_p_value REAL,
    mw_statistic REAL, -- Mann-Whitney
    mw_p_value REAL,
    -- Uncertainty comparison
    combined_uncertainty REAL,
    coverage_probability REAL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Metrology chain table
-- Traceability record; JSON-encoded TEXT columns hold the ordered chain and
-- its uncertainty breakdown.
CREATE TABLE IF NOT EXISTS metrology_chain (
    chain_id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT REFERENCES calibration_sessions(session_id),
    instrument_sequence TEXT NOT NULL, -- JSON array of instrument IDs in order
    calibration_dates TEXT, -- JSON array of calibration dates
    uncertainty_contributions TEXT, -- JSON with uncertainty breakdown
    traceability_statement TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Quality control flags table
-- Per-reading QC annotations, kept separate from the raw measurements.
CREATE TABLE IF NOT EXISTS quality_flags (
    flag_id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT REFERENCES calibration_sessions(session_id),
    timestamp TIMESTAMP,
    sensor_id TEXT REFERENCES sensors(sensor_id),
    flag_type TEXT CHECK(flag_type IN ('physical_limit', 'extremely_rare', 'comparison', 'consistency')),
    flag_value INTEGER, -- 0=good, 1=warning, 2=error
    description TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_measurements_timestamp ON measurements(timestamp);
CREATE INDEX IF NOT EXISTS idx_measurements_sensor_id ON measurements(sensor_id);
CREATE INDEX IF NOT EXISTS idx_measurements_session_id ON measurements(session_id);
CREATE INDEX IF NOT EXISTS idx_statistical_session_id ON statistical_results(session_id);
CREATE INDEX IF NOT EXISTS idx_comparison_session_id ON comparison_results(session_id);
-- Added: sensor_performance_summary joins comparison_results on
-- test_sensor_id, which previously had no supporting index.
CREATE INDEX IF NOT EXISTS idx_comparison_test_sensor_id ON comparison_results(test_sensor_id);
-- Added: quality_flags had no index; per-sensor lookups are the common case.
CREATE INDEX IF NOT EXISTS idx_quality_flags_sensor_id ON quality_flags(sensor_id);
-- Views for common queries
-- Per-sensor averages across all comparison sessions; the LEFT JOIN keeps
-- sensors that were never compared (their aggregate columns are NULL).
CREATE VIEW IF NOT EXISTS sensor_performance_summary AS
SELECT
    s.sensor_id,
    s.manufacturer,
    s.model,
    COUNT(DISTINCT cr.session_id) as calibration_count,
    AVG(cr.bias) as avg_bias,
    AVG(cr.rmse) as avg_rmse,
    AVG(cr.r_squared) as avg_r_squared
FROM sensors s
LEFT JOIN comparison_results cr ON s.sensor_id = cr.test_sensor_id
GROUP BY s.sensor_id, s.manufacturer, s.model;
-- One summary row per calibration session; the inner JOIN deliberately
-- omits sessions that recorded no measurements.
CREATE VIEW IF NOT EXISTS session_statistics AS
SELECT
    cs.session_id,
    cs.session_date,
    cs.location,
    COUNT(DISTINCT m.sensor_id) as sensor_count,
    MIN(m.timestamp) as start_time,
    MAX(m.timestamp) as end_time,
    COUNT(m.measurement_id) as total_measurements
FROM calibration_sessions cs
JOIN measurements m ON cs.session_id = m.session_id
GROUP BY cs.session_id, cs.session_date, cs.location;
-- Triggers for updated_at timestamps
-- Keeps sensors.updated_at current on every row update.  NOTE(review): the
-- trigger's own UPDATE does not re-fire it as long as SQLite's default
-- PRAGMA recursive_triggers = OFF is in effect — confirm the connection
-- settings if enabling recursive triggers.
CREATE TRIGGER IF NOT EXISTS update_sensors_timestamp
AFTER UPDATE ON sensors
FOR EACH ROW
BEGIN
    UPDATE sensors SET updated_at = CURRENT_TIMESTAMP WHERE sensor_id = NEW.sensor_id;
END;

View File

@@ -0,0 +1,343 @@
# Automated Report Generator for Pyranometer Data Analysis
# Creates comprehensive PDF reports with statistical analysis, uncertainty calculations, and visualizations
# Load required libraries
library(rmarkdown)
library(knitr)
library(ggplot2)
library(gridExtra)
library(scales)
library(lubridate)
library(dplyr)
library(tidyr)
#' Generate comprehensive pyranometer analysis report
#'
#' @param data Processed data frame with sensor measurements
#' @param statistical_results Results from statistical analysis
#' @param uncertainty_results Results from uncertainty analysis
#' @param sensor_info Information about sensors and calibration
#' @param output_dir Directory to save the report
#' @param report_title Title for the report
#' @param author_name Name of the report author
#' @param institution_name Name of the institution
#' @return Path to the generated PDF report
generate_pyranometer_report <- function(data, statistical_results, uncertainty_results,
                                        sensor_info, output_dir = "reports",
                                        report_title = "Pyranometer Calibration Analysis Report",
                                        author_name = "Automated Analysis System",
                                        institution_name = "Solar Metrology Laboratory") {
  # Create output directory if it doesn't exist
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  # Timestamped filename.  The output directory is resolved to an absolute
  # path because rmarkdown::render() interprets a *relative* output_file
  # relative to the directory of the input .Rmd (a tempdir here), not the
  # current working directory — the previous code wrote the PDF to tempdir.
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  report_filename <- paste0("pyranometer_report_", timestamp, ".pdf")
  report_path <- file.path(normalizePath(output_dir), report_filename)

  # Build the R Markdown source for the report
  rmd_content <- create_rmd_content(data, statistical_results, uncertainty_results,
                                    sensor_info, report_title, author_name, institution_name)

  # Write to a temporary .Rmd; guarantee cleanup even if render() errors
  # (the previous file.remove() was skipped on failure, leaking temp files).
  rmd_file <- tempfile(fileext = ".Rmd")
  writeLines(rmd_content, rmd_file)
  on.exit(unlink(rmd_file), add = TRUE)

  # Render PDF report
  cat("Generating PDF report...\n")
  render(rmd_file, output_file = report_path, output_format = "pdf_document")

  cat("Report generated successfully:", report_path, "\n")
  report_path
}
#' Create R Markdown content for the report
create_rmd_content <- function(data, statistical_results, uncertainty_results,
sensor_info, title, author, institution) {
rmd_content <- c(
"---",
paste("title:", shQuote(title)),
paste("author:", shQuote(author)),
paste("institution:", shQuote(institution)),
paste("date:", shQuote(format(Sys.time(), "%B %d, %Y"))),
"output: pdf_document",
"geometry: margin=1in",
"header-includes:",
" - \\usepackage{booktabs}",
" - \\usepackage{longtable}",
" - \\usepackage{array}",
" - \\usepackage{multirow}",
" - \\usepackage{wrapfig}",
" - \\usepackage{float}",
" - \\usepackage{colortbl}",
" - \\usepackage{pdflscape}",
" - \\usepackage{tabu}",
" - \\usepackage{threeparttable}",
"---",
"",
"```{r setup, include=FALSE}",
"knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.align = 'center')",
"library(ggplot2)",
"library(dplyr)",
"library(tidyr)",
"library(scales)",
"library(gridExtra)",
"library(lubridate)",
"```",
"",
"# Executive Summary",
"",
"This report presents the comprehensive analysis of pyranometer calibration data collected during the measurement session. The analysis includes statistical comparison of test sensors against the reference standard, uncertainty calculations following ISO GUM guidelines, and performance assessment of each sensor.",
"",
"## Key Findings",
"",
"```{r executive-summary, echo=FALSE}",
"# Generate executive summary table",
"exec_summary <- data.frame()",
"for (sensor_name in names(statistical_results)) {",
" stats <- statistical_results[[sensor_name]]",
" uncertainty <- uncertainty_results[[sensor_name]]",
" ",
" summary_row <- data.frame(",
" Sensor = sensor_name,",
" MBE = round(stats$error_metrics$mbe, 2),",
" RMSE = round(stats$error_metrics$rmse, 2),",
" R_squared = round(stats$distribution_analysis$r_squared, 3),",
" Expanded_Uncertainty = round(uncertainty$expanded_uncertainty$expanded_uncertainty, 2),",
" Performance_Ratio = round(stats$performance_indicators$performance_ratio, 3)",
" )",
" exec_summary <- rbind(exec_summary, summary_row)",
"}",
"knitr::kable(exec_summary, caption = 'Executive Summary: Key Performance Metrics')",
"```",
"",
"# 1. Introduction",
"",
"## 1.1 Measurement Objectives",
"This calibration session aims to:",
"- Compare the performance of multiple pyranometers against a reference standard",
"- Calculate measurement uncertainties following international standards",
"- Assess the long-term stability and reliability of each sensor",
"- Provide traceable calibration results for quality assurance",
"",
"## 1.2 Measurement Setup",
"```{r setup-info, echo=FALSE}",
"# Display measurement setup information",
"setup_info <- data.frame(",
" Parameter = c('Reference Sensor', 'Number of Test Sensors', 'Measurement Duration',",
" 'Sampling Interval', 'Total Measurements'),",
" Value = c(sensor_info$reference_sensor, length(names(statistical_results)),",
" paste(round(difftime(max(data$timestamp), min(data$timestamp), units = 'hours'), 'hours'),",
" paste(round(mean(diff(data$timestamp)), 'seconds'),",
" nrow(data))",
")",
"knitr::kable(setup_info, caption = 'Measurement Setup Parameters')",
"```",
"",
"# 2. Data Quality Assessment",
"",
"## 2.1 Data Completeness",
"```{r data-quality, echo=FALSE}",
"# Calculate data completeness",
"completeness_stats <- data.frame()",
"for (sensor_name in names(statistical_results)) {",
" complete_cases <- sum(complete.cases(data[[sensor_name]], data[[sensor_info$reference_sensor]]))",
" total_cases <- nrow(data)",
" completeness_pct <- round(complete_cases / total_cases * 100, 1)",
" ",
" quality_row <- data.frame(",
" Sensor = sensor_name,",
" Complete_Measurements = complete_cases,",
" Total_Measurements = total_cases,",
" Completeness = paste0(completeness_pct, '%')",
" )",
" completeness_stats <- rbind(completeness_stats, quality_row)",
"}",
"knitr::kable(completeness_stats, caption = 'Data Completeness Analysis')",
"```",
"",
"## 2.2 Time Series Visualization",
"```{r time-series, echo=FALSE, fig.height=6, fig.width=10}",
"# Create time series plot",
"p1 <- ggplot(data, aes(x = timestamp)) +",
" geom_line(aes(y = !!sym(sensor_info$reference_sensor), color = 'Reference'), alpha = 0.7) +",
" labs(title = 'Solar Irradiance Time Series',",
" x = 'Time', y = 'Irradiance (W/m²)', color = 'Sensor') +",
" theme_minimal() +",
" scale_color_manual(values = c('Reference' = 'blue'))",
"",
"colors <- c('red', 'green', 'purple', 'orange', 'brown')",
"color_idx <- 1",
"for (sensor_name in names(statistical_results)) {",
" p1 <- p1 + geom_line(aes(y = !!sym(sensor_name), color = sensor_name), alpha = 0.5)",
" color_idx <- color_idx + 1",
"}",
"print(p1)",
"```",
"",
"# 3. Statistical Analysis",
"",
"## 3.1 Basic Descriptive Statistics",
"```{r descriptive-stats, echo=FALSE}",
"# Create descriptive statistics table",
"desc_stats <- data.frame()",
"for (sensor_name in names(statistical_results)) {",
" stats <- statistical_results[[sensor_name]]$basic_statistics",
" ",
" desc_row <- data.frame(",
" Sensor = sensor_name,",
" Mean = round(stats$test_mean, 1),",
" Median = round(stats$test_median, 1),",
" SD = round(stats$test_sd, 1),",
" Min = round(stats$test_min, 1),",
" Max = round(stats$test_max, 1),",
" IQR = round(stats$test_iqr, 1)",
" )",
" desc_stats <- rbind(desc_stats, desc_row)",
"}",
"knitr::kable(desc_stats, caption = 'Descriptive Statistics for Test Sensors')",
"```",
"",
"## 3.2 Error Metrics and Performance Indicators",
"```{r error-metrics, echo=FALSE}",
"# Create error metrics table",
"error_table <- data.frame()",
"for (sensor_name in names(statistical_results)) {",
" errors <- statistical_results[[sensor_name]]$error_metrics",
" performance <- statistical_results[[sensor_name]]$performance_indicators",
" ",
" error_row <- data.frame(",
" Sensor = sensor_name,",
" MBE = round(errors$mbe, 2),",
" MAE = round(errors$mae, 2),",
" RMSE = round(errors$rmse, 2),",
" R_squared = round(statistical_results[[sensor_name]]$distribution_analysis$r_squared, 3),",
" Performance_Ratio = round(performance$performance_ratio, 3),",
" Variability_Index = round(performance$variability_index, 3)",
" )",
" error_table <- rbind(error_table, error_row)",
"}",
"knitr::kable(error_table, caption = 'Error Metrics and Performance Indicators')",
"```",
"",
"## 3.3 Correlation Analysis",
"```{r correlation-plot, echo=FALSE, fig.height=6, fig.width=8}",
"# Create correlation scatter plots",
"plot_list <- list()",
"for (sensor_name in names(statistical_results)[1:min(4, length(statistical_results))]) {",
" p <- ggplot(data, aes(x = !!sym(sensor_info$reference_sensor), y = !!sym(sensor_name))) +",
" geom_point(alpha = 0.3, size = 0.5) +",
" geom_smooth(method = 'lm', se = TRUE, color = 'red') +",
" labs(title = paste('Correlation:', sensor_name),",
" x = 'Reference (W/m²)', y = paste(sensor_name, '(W/m²)')) +",
" theme_minimal()",
" plot_list[[sensor_name]] <- p",
"}",
"grid.arrange(grobs = plot_list, ncol = 2)",
"```",
"",
"# 4. Uncertainty Analysis",
"",
"## 4.1 Measurement Uncertainty Budget",
"```{r uncertainty-budget, echo=FALSE}",
"# Create uncertainty budget table",
"uncertainty_table <- data.frame()",
"for (sensor_name in names(uncertainty_results)) {",
" uncertainty <- uncertainty_results[[sensor_name]]",
" ",
" uncertainty_row <- data.frame(",
" Sensor = sensor_name,",
" Type_A_Uncertainty = round(uncertainty$type_a_uncertainty$standard_uncertainty, 2),",
" Type_B_Uncertainty = round(uncertainty$type_b_uncertainty$combined_uncertainty, 2),",
" Combined_Uncertainty = round(uncertainty$combined_uncertainty$combined_standard_uncertainty, 2),",
" Expanded_Uncertainty = round(uncertainty$expanded_uncertainty$expanded_uncertainty, 2),",
" Coverage_Factor = uncertainty$coverage_factor",
" )",
" uncertainty_table <- rbind(uncertainty_table, uncertainty_row)",
"}",
"knitr::kable(uncertainty_table, caption = 'Measurement Uncertainty Analysis')",
"```",
"",
"## 4.2 Uncertainty Contributions",
"```{r uncertainty-pie, echo=FALSE, fig.height=6, fig.width=8}",
"# Create uncertainty contribution plot",
"if (length(uncertainty_results) > 0) {",
" sensor_name <- names(uncertainty_results)[1]",
" uncertainty <- uncertainty_results[[sensor_name]]",
" ",
" type_a_contrib <- uncertainty$type_a_uncertainty$standard_uncertainty^2",
" type_b_contrib <- uncertainty$type_b_uncertainty$combined_uncertainty^2",
" total_variance <- type_a_contrib + type_b_contrib",
" ",
" contrib_data <- data.frame(",
" Source = c('Type A (Random)', 'Type B (Systematic)'),",
" Contribution = c(type_a_contrib/total_variance * 100, type_b_contrib/total_variance * 100)",
" )",
" ",
" p <- ggplot(contrib_data, aes(x = '', y = Contribution, fill = Source)) +",
" geom_bar(stat = 'identity', width = 1) +",
" coord_polar('y', start = 0) +",
" labs(title = paste('Uncertainty Contribution for', sensor_name),",
" fill = 'Uncertainty Type') +",
" theme_void() +",
" geom_text(aes(label = paste0(round(Contribution, 1), '%')),",
" position = position_stack(vjust = 0.5))",
" print(p)",
"}",
"```",
"",
"# 5. Statistical Tests and Distribution Analysis",
"",
"## 5.1 Hypothesis Testing Results",
"```{r statistical-tests, echo=FALSE}",
"# Create statistical tests table",
"test_table <- data.frame()",
"for (sensor_name in names(statistical_results)) {",
" tests <- statistical_results[[sensor_name]]$statistical_tests",
" ",
" test_row <- data.frame(",
" Sensor = sensor_name,",
" KS_Statistic = round(tests$ks_statistic, 4),",
" KS_p_value = format.pval(tests$ks_p_value, digits = 3),",
" MW_Statistic = round(tests$mw_statistic, 1),",
" MW_p_value = format.pval(tests$mw_p_value, digits = 3),",
" Normality_p_value = format.pval(statistical_results[[sensor_name]]$distribution_analysis$errors_normality_p, digits = 3)",
" )",
" test_table <- rbind(test_table, test_row)",
"}",
"knitr::kable(test_table, caption = 'Statistical Hypothesis Test Results')",
"```",
"",
"## 5.2 Error Distribution",
"```{r error-distribution, echo=FALSE, fig.height=6, fig.width=10}",
"# Create error distribution plots",
"plot_list <- list()",
"for (sensor_name in names(statistical_results)[1:min(4, length(statistical_results))]) {",
" errors <- data[[sensor_name]] - data[[sensor_info$reference_sensor]]",
" error_data <- data.frame(Errors = errors[!is.na(errors)])",
" ",
" p <- ggplot(error_data, aes(x = Errors)) +",
" geom_histogram(aes(y = ..density..), bins = 30, fill = 'lightblue', alpha = 0.7) +",
" geom_density(color = 'red', size = 1) +",
" geom_vline(xintercept = mean(errors, na.rm = TRUE), color = 'blue', linetype = 'dashed', size = 1) +",
" labs(title = paste('Error Distribution:', sensor_name),",
" x = 'Error (W/m²)', y = 'Density') +",
" theme_minimal()",
" plot_list[[sensor_name]] <- p",
"}",
"grid.arrange(grobs = plot_list, ncol = 2)",
"```",
"",
"# 6. Conclusions and Recommendations",
"",
"## 6.1 Performance Classification",
"Based on the analysis results, sensors can be classified as follows:",
"",
"```{r classification, echo