From 78eeaf1364d3cb97e223d20945c7652a46c1b7d8 Mon Sep 17 00:00:00 2001 From: dino Date: Tue, 10 Feb 2026 10:53:06 +0100 Subject: [PATCH] Initial commit - SolSolAutomazione project --- .gitignore | 45 ++ R-VsCode/r_vscode_quick_start.md | 93 ++++ R-VsCode/r_vscode_setup_guide.md | 32 ++ R-VsCode/test_r_script.R | 55 +++ project_structure.md | 75 +++ src/analysis/statistical/advanced_metrics.R | 340 ++++++++++++++ .../uncertainty/uncertainty_calculator.R | 432 ++++++++++++++++++ src/data_cleaning/validator.py | 342 ++++++++++++++ src/data_ingestion/file_processor.py | 308 +++++++++++++ src/database/schema.sql | 196 ++++++++ src/reporting/report_generator.R | 343 ++++++++++++++ 11 files changed, 2261 insertions(+) create mode 100644 .gitignore create mode 100644 R-VsCode/r_vscode_quick_start.md create mode 100644 R-VsCode/r_vscode_setup_guide.md create mode 100644 R-VsCode/test_r_script.R create mode 100644 project_structure.md create mode 100644 src/analysis/statistical/advanced_metrics.R create mode 100644 src/analysis/uncertainty/uncertainty_calculator.R create mode 100644 src/data_cleaning/validator.py create mode 100644 src/data_ingestion/file_processor.py create mode 100644 src/database/schema.sql create mode 100644 src/reporting/report_generator.R diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f05391 --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# R +.Rproj.user +.Rhistory +.RData +.Ruserdata +*.Rproj + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log \ No newline at end of file diff --git a/R-VsCode/r_vscode_quick_start.md b/R-VsCode/r_vscode_quick_start.md new file mode 100644 index 0000000..1060d26 --- /dev/null +++ b/R-VsCode/r_vscode_quick_start.md @@ -0,0 +1,93 @@ +# 
Quick Start: Testing R in VSCode + +## Step 1: Install R Extensions +1. Open VSCode +2. Go to Extensions panel (Ctrl+Shift+X) +3. Search for and install: + - "R" by REditorSupport + - "R Debugger" by RDebugger + - "R LSP Client" by REditorSupport + +## Step 2: Create Your First R File +1. Create a new file (Ctrl+N) +2. Save it with `.R` extension (e.g., `test.R`) +3. Type some basic R code: + ```r + print("Hello R!") + x <- 1:5 + mean(x) + ``` + +## Step 3: Run R Code in VSCode +### Method A: Using R Terminal +1. Open Terminal in VSCode (Ctrl+` ) +2. Type `R` to start R interactive session +3. Type commands directly: + ```r + > 2 + 2 + [1] 4 + > plot(1:10) + ``` + +### Method B: Run Entire Script +1. Open your `.R` file +2. Press Ctrl+Shift+P +3. Type "R: Run Source" and press Enter +4. Output appears in R Terminal + +### Method C: Run Selected Code +1. Select code in your R file +2. Right-click → "Run Selected Line(s)" +3. Or use shortcut: Ctrl+Enter + +## Step 4: View Plots +1. Run plot commands in R terminal: + ```r + plot(1:10, main="Test Plot") + ``` +2. Plots appear in VSCode's Plot viewer panel + +## Step 5: Use R Markdown (Optional) +1. Create new file with `.Rmd` extension +2. Add code chunks: + ```r + ```{r} + summary(mtcars) + ``` + ``` +3. Click "Knit" button to render document + +## Step 6: Debug R Code +1. Set breakpoint by clicking left margin +2. Press F5 to start debugging +3. 
Step through code with F10/F11 + +## Quick Test Commands to Try +In R terminal, test these: +```r +# Basic math +2 + 2 +sqrt(16) + +# Create data +data <- c(1, 3, 5, 7, 9) +mean(data) + +# Simple plot +plot(data, type="l", col="blue") + +# Check packages +installed.packages()[1:5,] +``` + +## What You Should See +- ✅ Code completion as you type +- ✅ Syntax highlighting +- ✅ R terminal with `>` prompt +- ✅ Plots in separate viewer +- ✅ Variable values in debugger + +## Troubleshooting +- If R isn't found: Check R path in extension settings +- Restart VSCode after installing extensions +- Ensure R is in system PATH diff --git a/R-VsCode/r_vscode_setup_guide.md b/R-VsCode/r_vscode_setup_guide.md new file mode 100644 index 0000000..064d77b --- /dev/null +++ b/R-VsCode/r_vscode_setup_guide.md @@ -0,0 +1,32 @@ +# Setting up R in VSCode + +## Current Status +✅ **R is installed**: Version 4.4.3 (2025-02-28) -- "Trophy Case" +✅ **R executable**: `/usr/bin/R` + +## Required VSCode Extensions + +Install these extensions for optimal R development: + +### Essential Extensions: +1. **R Extension for Visual Studio Code** (by REditorSupport) + - Provides syntax highlighting, code completion, and R integration + - Install with: `codium --install-extension REditorSupport.r` + +2. **R Debugger** (by RDebugger) + - Debugging support for R scripts + - Install with: `codium --install-extension RDebugger.r-debugger` + +3. 
**R LSP Client** (by REditorSupport) + - Language Server Protocol support for R + - In VS Code, go to Extensions (Ctrl+Shift+X) and search for: + "R LSP Client" by REditorSupport +### Optional Extensions: +- **R Markdown Preview** - For R Markdown files +- **Quarto** - For modern R documents and reports + +## Installation Commands + +Run these commands to install the essential extensions: + + diff --git a/R-VsCode/test_r_script.R b/R-VsCode/test_r_script.R new file mode 100644 index 0000000..6c82913 --- /dev/null +++ b/R-VsCode/test_r_script.R @@ -0,0 +1,55 @@ +# Test R Script for VSCode Setup +# This script demonstrates basic R functionality + +# Basic operations +x <- 1:10 +y <- x^2 + +cat("=== Basic R Test ===\n") +cat("Vector x:", x, "\n") +cat("Vector y:", y, "\n") +cat("Mean of x:", mean(x), "\n") +cat("Sum of y:", sum(y), "\n") +cat("Standard deviation of x:", sd(x), "\n\n") + +# Data frame example +df <- data.frame( + id = 1:5, + name = c("Alice", "Bob", "Charlie", "Diana", "Eve"), + age = c(25, 30, 35, 28, 32), + score = c(85, 92, 78, 96, 88) +) + +cat("=== Data Frame Example ===\n") +print(df) +cat("\n") + +# Statistical summary +cat("=== Statistical Summary ===\n") +cat("Summary of scores:\n") +print(summary(df$score)) +cat("\n") + +# Simple function +calculate_stats <- function(data) { + list( + mean = mean(data), + median = median(data), + min = min(data), + max = max(data) + ) +} + +cat("=== Custom Function Test ===\n") +stats <- calculate_stats(df$score) +print(stats) +cat("\n") + +# Plot example (will create a plot if run in R) +cat("=== Plot Information ===\n") +cat("To create a plot, run this script in R and use:\n") +cat("plot(x, y, main = 'Test Plot: y = x^2', type = 'b', col = 'blue')\n") +cat("hist(df$score, main = 'Score Distribution', col = 'lightgreen')\n\n") + +cat("✅ R script executed successfully!\n") +cat("Your R environment is working correctly.\n") diff --git a/project_structure.md b/project_structure.md new file mode 100644 index 
0000000..3efa8d9 --- /dev/null +++ b/project_structure.md @@ -0,0 +1,75 @@ +# Pyranometer Data Analysis System - Project Structure + +## Core Components + +### 1. Configuration & Setup +- `config/` - Configuration files +- `requirements/` - Dependencies and setup scripts + +### 2. Data Processing +- `src/data_ingestion/` - CSV/TXT file processing +- `src/data_cleaning/` - Data validation and cleaning +- `src/database/` - Database operations and schema + +### 3. Analysis Engine +- `src/analysis/statistical/` - Statistical metrics and tests +- `src/analysis/uncertainty/` - Uncertainty calculations +- `src/analysis/comparison/` - Sensor comparison methods + +### 4. Reporting & Visualization +- `src/reporting/` - Automated report generation +- `src/visualization/` - Plots and dashboards + +### 5. Automation & Scheduling +- `scripts/` - Automation scripts +- `scheduling/` - Cron jobs and scheduling + +## File Structure +``` +pyranometer_analysis/ +├── config/ +│ ├── system_config.yaml +│ ├── sensor_database.csv +│ └── calibration_standards.yaml +├── src/ +│ ├── data_ingestion/ +│ │ ├── file_processor.py +│ │ ├── csv_parser.py +│ │ └── txt_parser.py +│ ├── data_cleaning/ +│ │ ├── validator.py +│ │ ├── quality_control.py +│ │ └── outlier_detection.py +│ ├── database/ +│ │ ├── schema.sql +│ │ ├── operations.py +│ │ └── migrations/ +│ ├── analysis/ +│ │ ├── statistical/ +│ │ │ ├── basic_metrics.R +│ │ │ ├── advanced_metrics.R +│ │ │ └── time_series_analysis.R +│ │ ├── uncertainty/ +│ │ │ ├── uncertainty_calculator.R +│ │ │ └── monte_carlo_simulation.R +│ │ └── comparison/ +│ │ ├── sensor_comparison.R +│ │ └── performance_analysis.R +│ ├── reporting/ +│ │ ├── report_generator.R +│ │ ├── template.Rmd +│ │ └── pdf_renderer.py +│ └── visualization/ +│ ├── plots.R +│ └── dashboard.py +├── scripts/ +│ ├── run_analysis.py +│ ├── automated_report.py +│ └── data_import.py +├── tests/ +│ ├── test_data_processing.py +│ └── test_analysis.py +└── data/ + ├── raw/ # Input CSV/TXT files + 
├── processed/ # Cleaned data + └── reports/ # Generated reports diff --git a/src/analysis/statistical/advanced_metrics.R b/src/analysis/statistical/advanced_metrics.R new file mode 100644 index 0000000..bc41411 --- /dev/null +++ b/src/analysis/statistical/advanced_metrics.R @@ -0,0 +1,340 @@ +# Advanced Statistical Analysis for Pyranometer Data +# Implements research-backed statistical methods for solar irradiance analysis + +# Load required libraries +library(dplyr) +library(tidyr) +library(lubridate) +library(forecast) +library(tseries) +library(nortest) +library(DescTools) +library(moments) + +#' Calculate comprehensive statistical metrics for pyranometer data +#' +#' @param data Data frame with timestamp and sensor columns +#' @param reference_col Name of reference sensor column +#' @param test_cols Vector of test sensor column names +#' @param timestamp_col Name of timestamp column (default: "timestamp") +#' @return List containing comprehensive statistical analysis +calculate_advanced_metrics <- function(data, reference_col, test_cols, timestamp_col = "timestamp") { + + results <- list() + + for (test_col in test_cols) { + cat("Analyzing sensor:", test_col, "vs reference:", reference_col, "\n") + + # Extract clean data (remove NAs) + clean_data <- data[complete.cases(data[[reference_col]], data[[test_col]]), ] + + if (nrow(clean_data) < 10) { + warning(paste("Insufficient data for comparison:", test_col, "vs", reference_col)) + next + } + + reference <- clean_data[[reference_col]] + test <- clean_data[[test_col]] + + # Basic descriptive statistics + basic_stats <- calculate_basic_statistics(test, reference) + + # Error metrics + error_metrics <- calculate_error_metrics(test, reference) + + # Distribution analysis + distribution_analysis <- analyze_distribution(test, reference) + + # Time series analysis + time_series_analysis <- analyze_time_series(clean_data, test_col, reference_col, timestamp_col) + + # Statistical tests + statistical_tests <- 
perform_statistical_tests(test, reference) + + # Performance indicators + performance_indicators <- calculate_performance_indicators(test, reference) + + results[[test_col]] <- list( + basic_statistics = basic_stats, + error_metrics = error_metrics, + distribution_analysis = distribution_analysis, + time_series_analysis = time_series_analysis, + statistical_tests = statistical_tests, + performance_indicators = performance_indicators, + sample_size = nrow(clean_data) + ) + } + + return(results) +} + +#' Calculate basic descriptive statistics +calculate_basic_statistics <- function(test, reference) { + list( + test_mean = mean(test, na.rm = TRUE), + test_median = median(test, na.rm = TRUE), + test_sd = sd(test, na.rm = TRUE), + test_min = min(test, na.rm = TRUE), + test_max = max(test, na.rm = TRUE), + test_q1 = quantile(test, 0.25, na.rm = TRUE), + test_q3 = quantile(test, 0.75, na.rm = TRUE), + test_iqr = IQR(test, na.rm = TRUE), + test_skewness = moments::skewness(test, na.rm = TRUE), + test_kurtosis = moments::kurtosis(test, na.rm = TRUE), + + reference_mean = mean(reference, na.rm = TRUE), + reference_median = median(reference, na.rm = TRUE), + reference_sd = sd(reference, na.rm = TRUE) + ) +} + +#' Calculate comprehensive error metrics +calculate_error_metrics <- function(test, reference) { + errors <- test - reference + + list( + # Absolute errors + mbe = mean(errors, na.rm = TRUE), # Mean Bias Error + mae = mean(abs(errors), na.rm = TRUE), # Mean Absolute Error + rmse = sqrt(mean(errors^2, na.rm = TRUE)), # Root Mean Square Error + + # Relative errors (percentage) + relative_mbe = (mean(errors, na.rm = TRUE) / mean(reference, na.rm = TRUE)) * 100, + relative_mae = (mean(abs(errors), na.rm = TRUE) / mean(reference, na.rm = TRUE)) * 100, + relative_rmse = (sqrt(mean(errors^2, na.rm = TRUE)) / mean(reference, na.rm = TRUE)) * 100, + + # Advanced error metrics + mape = mean(abs(errors / reference), na.rm = TRUE) * 100, # Mean Absolute Percentage Error + smape = 
100 * mean(2 * abs(errors) / (abs(test) + abs(reference)), na.rm = TRUE), # Symmetric MAPE + nrmse = sqrt(mean(errors^2, na.rm = TRUE)) / (max(reference, na.rm = TRUE) - min(reference, na.rm = TRUE)), # Normalized RMSE + + # Theil's decomposition + theil_u = theils_u(test, reference), + + # Error distribution + error_mean = mean(errors, na.rm = TRUE), + error_sd = sd(errors, na.rm = TRUE), + error_skewness = moments::skewness(errors, na.rm = TRUE), + error_kurtosis = moments::kurtosis(errors, na.rm = TRUE) + ) +} + +#' Calculate Theil's U statistic +theils_u <- function(actual, predicted) { + # Theil's U statistic for forecast accuracy + n <- length(actual) + numerator <- sqrt(sum((predicted - actual)^2) / n) + denominator <- sqrt(sum(actual^2) / n) + sqrt(sum(predicted^2) / n) + return(numerator / denominator) +} + +#' Analyze distribution characteristics +analyze_distribution <- function(test, reference) { + list( + # Correlation analysis + pearson_correlation = cor(test, reference, method = "pearson"), + spearman_correlation = cor(test, reference, method = "spearman"), + kendall_correlation = cor(test, reference, method = "kendall"), + + # Regression analysis + regression_slope = coef(lm(test ~ reference))[2], + regression_intercept = coef(lm(test ~ reference))[1], + r_squared = summary(lm(test ~ reference))$r.squared, + adjusted_r_squared = summary(lm(test ~ reference))$adj.r.squared, + + # Distribution tests + test_normality_p = shapiro.test(test)$p.value, + reference_normality_p = shapiro.test(reference)$p.value, + errors_normality_p = shapiro.test(test - reference)$p.value + ) +} + +#' Perform comprehensive time series analysis +analyze_time_series <- function(data, test_col, reference_col, timestamp_col) { + + # Ensure data is sorted by timestamp + data <- data[order(data[[timestamp_col]]), ] + + test_ts <- data[[test_col]] + reference_ts <- data[[reference_col]] + + # Calculate time differences + time_diffs <- diff(data[[timestamp_col]]) + avg_interval <- 
mean(as.numeric(time_diffs, units = "secs")) + + # Autocorrelation analysis + test_acf <- acf(test_ts, plot = FALSE) + reference_acf <- acf(reference_ts, plot = FALSE) + + # Partial autocorrelation + test_pacf <- pacf(test_ts, plot = FALSE) + reference_pacf <- pacf(reference_ts, plot = FALSE) + + # Cross-correlation + ccf_result <- ccf(test_ts, reference_ts, plot = FALSE) + + # Stationarity tests + test_adf <- tryCatch( + adf.test(test_ts)$p.value, + error = function(e) NA + ) + + reference_adf <- tryCatch( + adf.test(reference_ts)$p.value, + error = function(e) NA + ) + + list( + sampling_interval_seconds = avg_interval, + test_autocorrelation_lag1 = test_acf$acf[2], + reference_autocorrelation_lag1 = reference_acf$acf[2], + cross_correlation_max = max(ccf_result$acf), + cross_correlation_lag = ccf_result$lag[which.max(ccf_result$acf)], + test_stationarity_p = test_adf, + reference_stationarity_p = reference_adf + ) +} + +#' Perform statistical hypothesis tests +perform_statistical_tests <- function(test, reference) { + errors <- test - reference + + # Kolmogorov-Smirnov test for distribution similarity + ks_test <- tryCatch( + ks.test(test, reference), + error = function(e) list(statistic = NA, p.value = NA) + ) + + # Mann-Whitney U test (non-parametric) + mw_test <- tryCatch( + wilcox.test(test, reference), + error = function(e) list(statistic = NA, p.value = NA) + ) + + # Variance equality test (F-test) + var_test <- tryCatch( + var.test(test, reference), + error = function(e) list(statistic = NA, p.value = NA) + ) + + # Anderson-Darling test for normality of errors + ad_test <- tryCatch( + nortest::ad.test(errors), + error = function(e) list(statistic = NA, p.value = NA) + ) + + list( + ks_statistic = ks_test$statistic, + ks_p_value = ks_test$p.value, + mw_statistic = mw_test$statistic, + mw_p_value = mw_test$p.value, + f_statistic = var_test$statistic, + f_p_value = var_test$p.value, + ad_statistic = ad_test$statistic, + ad_p_value = ad_test$p.value + ) +} + 
+#' Calculate performance indicators specific to solar applications +calculate_performance_indicators <- function(test, reference) { + + # Performance Ratio (simplified) + performance_ratio <- mean(test, na.rm = TRUE) / mean(reference, na.rm = TRUE) + + # Clear Sky Index (simplified - assumes reference is clear sky) + clear_sky_index <- mean(test / reference, na.rm = TRUE) + + # Variability index + variability_index <- sd(test, na.rm = TRUE) / mean(test, na.rm = TRUE) + + # Consistency metrics + consistency_ratio <- 1 - (sd(test - reference, na.rm = TRUE) / mean(reference, na.rm = TRUE)) + + list( + performance_ratio = performance_ratio, + clear_sky_index = clear_sky_index, + variability_index = variability_index, + consistency_ratio = consistency_ratio + ) +} + +#' Generate comprehensive statistical report +generate_statistical_report <- function(analysis_results, output_file = NULL) { + + report <- list() + + for (sensor_name in names(analysis_results)) { + sensor_results <- analysis_results[[sensor_name]] + + sensor_report <- list( + sensor_name = sensor_name, + sample_size = sensor_results$sample_size, + + # Summary statistics + mean_bias_error = sensor_results$error_metrics$mbe, + root_mean_square_error = sensor_results$error_metrics$rmse, + mean_absolute_error = sensor_results$error_metrics$mae, + + # Correlation and fit + r_squared = sensor_results$distribution_analysis$r_squared, + pearson_correlation = sensor_results$distribution_analysis$pearson_correlation, + + # Statistical significance + ks_p_value = sensor_results$statistical_tests$ks_p_value, + mw_p_value = sensor_results$statistical_tests$mw_p_value, + + # Performance indicators + performance_ratio = sensor_results$performance_indicators$performance_ratio, + variability_index = sensor_results$performance_indicators$variability_index + ) + + report[[sensor_name]] <- sensor_report + } + + # Convert to data frame for easier viewing + report_df <- do.call(rbind, lapply(report, as.data.frame)) + + if 
(!is.null(output_file)) { + write.csv(report_df, output_file, row.names = FALSE) + cat("Statistical report saved to:", output_file, "\n") + } + + return(report_df) +} + +# Example usage and testing +if (FALSE) { + # Create sample data + set.seed(123) + n <- 1000 + timestamp <- seq(as.POSIXct("2024-01-01 06:00:00"), by = "1 min", length.out = n) + + # Simulate reference sensor (clear sky conditions with some noise) + reference <- 800 + 200 * sin(seq(0, 2*pi, length.out = n)) + rnorm(n, 0, 20) + + # Simulate test sensors with different characteristics + sensor1 <- reference + rnorm(n, 5, 15) # Small bias, good precision + sensor2 <- reference * 1.02 + rnorm(n, 0, 30) # 2% overestimation, poorer precision + sensor3 <- reference + rnorm(n, -10, 25) # Negative bias + + sample_data <- data.frame( + timestamp = timestamp, + reference_sensor = reference, + test_sensor_1 = sensor1, + test_sensor_2 = sensor2, + test_sensor_3 = sensor3 + ) + + # Run analysis + results <- calculate_advanced_metrics( + data = sample_data, + reference_col = "reference_sensor", + test_cols = c("test_sensor_1", "test_sensor_2", "test_sensor_3") + ) + + # Generate report + report <- generate_statistical_report(results, "sensor_comparison_report.csv") + + print("Advanced statistical analysis completed!") + print(report) +} diff --git a/src/analysis/uncertainty/uncertainty_calculator.R b/src/analysis/uncertainty/uncertainty_calculator.R new file mode 100644 index 0000000..1327242 --- /dev/null +++ b/src/analysis/uncertainty/uncertainty_calculator.R @@ -0,0 +1,432 @@ +# Uncertainty Calculation Engine for Pyranometer Data +# Implements ISO GUM (Guide to the Expression of Uncertainty in Measurement) standards +# and advanced uncertainty analysis methods for solar irradiance measurements + +# Load required libraries +library(dplyr) +library(tidyr) +library(moments) +library(boot) + +#' Calculate comprehensive measurement uncertainty for pyranometer data +#' +#' @param data Data frame with sensor 
measurements +#' @param reference_col Name of reference sensor column +#' @param test_cols Vector of test sensor column names +#' @param sensor_specs List containing sensor specifications and calibration data +#' @param coverage_factor Coverage factor for expanded uncertainty (default: 2 for 95% confidence) +#' @return List containing comprehensive uncertainty analysis +calculate_measurement_uncertainty <- function(data, reference_col, test_cols, + sensor_specs = NULL, coverage_factor = 2) { + + results <- list() + + for (test_col in test_cols) { + cat("Calculating uncertainty for sensor:", test_col, "\n") + + # Extract clean data + clean_data <- data[complete.cases(data[[reference_col]], data[[test_col]]), ] + + if (nrow(clean_data) < 10) { + warning(paste("Insufficient data for uncertainty analysis:", test_col)) + next + } + + reference <- clean_data[[reference_col]] + test <- clean_data[[test_col]] + errors <- test - reference + + # Type A uncertainty (statistical analysis of measurements) + type_a_uncertainty <- calculate_type_a_uncertainty(errors, test, reference) + + # Type B uncertainty (systematic uncertainties from specifications) + type_b_uncertainty <- calculate_type_b_uncertainty(test_col, sensor_specs, mean(reference)) + + # Combined standard uncertainty + combined_uncertainty <- calculate_combined_uncertainty(type_a_uncertainty, type_b_uncertainty) + + # Expanded uncertainty + expanded_uncertainty <- calculate_expanded_uncertainty(combined_uncertainty, coverage_factor) + + # Uncertainty budget + uncertainty_budget <- create_uncertainty_budget(type_a_uncertainty, type_b_uncertainty, + combined_uncertainty, expanded_uncertainty) + + # Monte Carlo simulation for complex uncertainty propagation + monte_carlo_results <- perform_monte_carlo_simulation(test, reference, sensor_specs, test_col) + + results[[test_col]] <- list( + type_a_uncertainty = type_a_uncertainty, + type_b_uncertainty = type_b_uncertainty, + combined_uncertainty = combined_uncertainty, + 
expanded_uncertainty = expanded_uncertainty, + uncertainty_budget = uncertainty_budget, + monte_carlo_simulation = monte_carlo_results, + coverage_factor = coverage_factor, + sample_size = nrow(clean_data) + ) + } + + return(results) +} + +#' Calculate Type A uncertainty (statistical analysis) +calculate_type_a_uncertainty <- function(errors, test, reference) { + + n <- length(errors) + + # Standard uncertainty from repeated measurements + standard_uncertainty <- sd(errors, na.rm = TRUE) / sqrt(n) + + # Uncertainty of the mean + uncertainty_of_mean <- sd(test, na.rm = TRUE) / sqrt(n) + + # Regression uncertainty (if using regression model) + regression_uncertainty <- calculate_regression_uncertainty(test, reference) + + # Autocorrelation-corrected uncertainty + autocorr_uncertainty <- calculate_autocorrelation_uncertainty(errors) + + list( + standard_uncertainty = standard_uncertainty, + uncertainty_of_mean = uncertainty_of_mean, + regression_uncertainty = regression_uncertainty, + autocorrelation_uncertainty = autocorr_uncertainty, + standard_deviation = sd(errors, na.rm = TRUE), + standard_error = sd(errors, na.rm = TRUE) / sqrt(n), + degrees_of_freedom = n - 1 + ) +} + +#' Calculate Type B uncertainty (systematic uncertainties) +calculate_type_b_uncertainty <- function(sensor_name, sensor_specs, mean_irradiance) { + + # Default uncertainties if no specifications provided + default_uncertainties <- list( + calibration_uncertainty = 0.02, # 2% calibration uncertainty + temperature_uncertainty = 0.005, # 0.5% per °C temperature effect + cosine_uncertainty = 0.01, # 1% cosine response uncertainty + azimuth_uncertainty = 0.002, # 0.2% azimuth uncertainty + spectral_uncertainty = 0.015, # 1.5% spectral mismatch + linearity_uncertainty = 0.005, # 0.5% non-linearity + stability_uncertainty = 0.01, # 1% long-term stability + resolution_uncertainty = 0.001 # 0.1% resolution uncertainty + ) + + # Use sensor-specific specifications if available + if (!is.null(sensor_specs) 
&& sensor_name %in% names(sensor_specs)) { + specs <- sensor_specs[[sensor_name]] + uncertainties <- modifyList(default_uncertainties, specs) + } else { + uncertainties <- default_uncertainties + } + + # Convert relative uncertainties to absolute (W/m²) + absolute_uncertainties <- lapply(uncertainties, function(u) u * mean_irradiance) + + # Calculate combined Type B uncertainty + type_b_combined <- sqrt(sum(sapply(absolute_uncertainties, function(x) x^2))) + + list( + relative_uncertainties = uncertainties, + absolute_uncertainties = absolute_uncertainties, + combined_uncertainty = type_b_combined, + effective_degrees_of_freedom = Inf # Type B uncertainties typically have infinite df + ) +} + +#' Calculate regression-related uncertainty +calculate_regression_uncertainty <- function(test, reference) { + + # Fit linear regression model + model <- lm(test ~ reference) + + # Standard error of the regression + regression_se <- summary(model)$sigma + + # Prediction interval uncertainty + n <- length(test) + prediction_uncertainty <- regression_se * sqrt(1 + 1/n) + + list( + regression_standard_error = regression_se, + prediction_uncertainty = prediction_uncertainty, + residual_standard_error = sd(residuals(model)) + ) +} + +#' Calculate uncertainty corrected for autocorrelation +calculate_autocorrelation_uncertainty <- function(errors) { + + n <- length(errors) + + # Calculate autocorrelation at lag 1 + acf_result <- acf(errors, plot = FALSE, lag.max = 1) + rho <- acf_result$acf[2] + + # Effective sample size considering autocorrelation + effective_n <- n * (1 - rho) / (1 + rho) + + # Autocorrelation-corrected uncertainty + corrected_uncertainty <- sd(errors, na.rm = TRUE) / sqrt(effective_n) + + list( + autocorrelation_coefficient = rho, + effective_sample_size = effective_n, + corrected_uncertainty = corrected_uncertainty + ) +} + +#' Calculate combined standard uncertainty +calculate_combined_uncertainty <- function(type_a, type_b) { + + # Combine Type A and Type B 
uncertainties + combined <- sqrt(type_a$standard_uncertainty^2 + type_b$combined_uncertainty^2) + + # Calculate effective degrees of freedom using Welch-Satterthwaite formula + u_a <- type_a$standard_uncertainty + u_b <- type_b$combined_uncertainty + df_a <- type_a$degrees_of_freedom + df_b <- type_b$effective_degrees_of_freedom + + effective_df <- (combined^4) / ((u_a^4 / df_a) + (u_b^4 / df_b)) + + list( + combined_standard_uncertainty = combined, + effective_degrees_of_freedom = effective_df, + coverage_factor_95 = qt(0.975, effective_df) + ) +} + +#' Calculate expanded uncertainty +calculate_expanded_uncertainty <- function(combined_uncertainty, coverage_factor = 2) { + + expanded <- coverage_factor * combined_uncertainty$combined_standard_uncertainty + + list( + expanded_uncertainty = expanded, + coverage_factor = coverage_factor, + confidence_level = ifelse(coverage_factor == 2, 0.9545, + ifelse(coverage_factor == 1.96, 0.95, NA)) + ) +} + +#' Create detailed uncertainty budget +create_uncertainty_budget <- function(type_a, type_b, combined, expanded) { + + budget <- list( + type_a_components = list( + standard_uncertainty = type_a$standard_uncertainty, + uncertainty_of_mean = type_a$uncertainty_of_mean, + regression_uncertainty = type_a$regression_uncertainty$regression_standard_error, + autocorrelation_uncertainty = type_a$autocorrelation_uncertainty$corrected_uncertainty + ), + + type_b_components = type_b$absolute_uncertainties, + + summary = list( + combined_standard_uncertainty = combined$combined_standard_uncertainty, + expanded_uncertainty = expanded$expanded_uncertainty, + coverage_factor = expanded$coverage_factor, + effective_degrees_of_freedom = combined$effective_degrees_of_freedom + ) + ) + + return(budget) +} + +#' Perform Monte Carlo simulation for complex uncertainty propagation +perform_monte_carlo_simulation <- function(test, reference, sensor_specs, sensor_name, + n_simulations = 10000) { + + n <- length(test) + mean_test <- mean(test, 
na.rm = TRUE) + mean_reference <- mean(reference, na.rm = TRUE) + + # Get sensor specifications + if (!is.null(sensor_specs) && sensor_name %in% names(sensor_specs)) { + specs <- sensor_specs[[sensor_name]] + } else { + specs <- list( + calibration_uncertainty = 0.02, + temperature_uncertainty = 0.005, + cosine_uncertainty = 0.01 + ) + } + + # Initialize simulation results + simulated_biases <- numeric(n_simulations) + + for (i in 1:n_simulations) { + # Simulate random errors for each uncertainty component + cal_error <- rnorm(1, 0, specs$calibration_uncertainty * mean_reference) + temp_error <- rnorm(1, 0, specs$temperature_uncertainty * mean_reference) + cosine_error <- rnorm(1, 0, specs$cosine_uncertainty * mean_reference) + + # Simulate random measurement errors + measurement_error <- rnorm(1, 0, sd(test - reference, na.rm = TRUE)) + + # Total simulated bias + total_error <- cal_error + temp_error + cosine_error + measurement_error + simulated_biases[i] <- total_error + } + + # Calculate statistics from simulation + simulation_stats <- list( + mean_bias = mean(simulated_biases), + sd_bias = sd(simulated_biases), + quantile_2.5 = quantile(simulated_biases, 0.025), + quantile_97.5 = quantile(simulated_biases, 0.975), + simulation_size = n_simulations + ) + + return(simulation_stats) +} + +#' Generate uncertainty report +generate_uncertainty_report <- function(uncertainty_results, output_file = NULL) { + + report <- list() + + for (sensor_name in names(uncertainty_results)) { + sensor_results <- uncertainty_results[[sensor_name]] + + sensor_report <- list( + sensor_name = sensor_name, + sample_size = sensor_results$sample_size, + + # Key uncertainty metrics + combined_standard_uncertainty = sensor_results$combined_uncertainty$combined_standard_uncertainty, + expanded_uncertainty = sensor_results$expanded_uncertainty$expanded_uncertainty, + coverage_factor = sensor_results$expanded_uncertainty$coverage_factor, + + # Type A components + type_a_standard_uncertainty = 
sensor_results$type_a_uncertainty$standard_uncertainty, + type_a_degrees_of_freedom = sensor_results$type_a_uncertainty$degrees_of_freedom, + + # Type B components + type_b_combined_uncertainty = sensor_results$type_b_uncertainty$combined_uncertainty, + + # Monte Carlo results + monte_carlo_mean_bias = sensor_results$monte_carlo_simulation$mean_bias, + monte_carlo_uncertainty = sensor_results$monte_carlo_simulation$sd_bias + ) + + report[[sensor_name]] <- sensor_report + } + + # Convert to data frame + report_df <- do.call(rbind, lapply(report, as.data.frame)) + + if (!is.null(output_file)) { + write.csv(report_df, output_file, row.names = FALSE) + cat("Uncertainty report saved to:", output_file, "\n") + } + + return(report_df) +} + +#' Calculate measurement uncertainty following ISO GUM +#' +#' This function implements the complete ISO GUM uncertainty calculation procedure +#' @param measurement_result The measured value +#' @param uncertainty_components List of uncertainty components with their values and types +#' @param coverage_probability Desired coverage probability (default: 0.95) +#' @return List containing the complete uncertainty analysis +calculate_iso_gum_uncertainty <- function(measurement_result, uncertainty_components, + coverage_probability = 0.95) { + + # Step 1: Identify all uncertainty sources + sources <- names(uncertainty_components) + + # Step 2: Quantify uncertainty components + standard_uncertainties <- sapply(uncertainty_components, function(comp) { + if (comp$type == "A") { + # Type A: statistical analysis + comp$value / sqrt(comp$n) # Standard error of the mean + } else if (comp$type == "B") { + # Type B: rectangular distribution assumed unless specified + if (!is.null(comp$distribution) && comp$distribution == "normal") { + comp$value + } else { + comp$value / sqrt(3) # Rectangular distribution + } + } else { + comp$value # Default: treat as standard uncertainty + } + }) + + # Step 3: Calculate combined standard uncertainty + 
combined_uncertainty <- sqrt(sum(standard_uncertainties^2)) + + # Step 4: Calculate expanded uncertainty + # For large degrees of freedom, use coverage factor 2 for 95% confidence + expanded_uncertainty <- 2 * combined_uncertainty + + list( + measurement_result = measurement_result, + standard_uncertainties = standard_uncertainties, + combined_standard_uncertainty = combined_uncertainty, + expanded_uncertainty = expanded_uncertainty, + coverage_factor = 2, + coverage_probability = coverage_probability, + uncertainty_budget = data.frame( + source = sources, + standard_uncertainty = standard_uncertainties + ) + ) +} + +# Example usage and testing +if (FALSE) { + # Create sample data + set.seed(123) + n <- 500 + timestamp <- seq(as.POSIXct("2024-01-01 06:00:00"), by = "1 min", length.out = n) + + # Simulate reference and test sensors + reference <- 800 + 200 * sin(seq(0, 2*pi, length.out = n)) + rnorm(n, 0, 10) + test_sensor <- reference + rnorm(n, 5, 15) # 5 W/m² bias, 15 W/m² random error + + sample_data <- data.frame( + timestamp = timestamp, + reference_sensor = reference, + test_sensor = test_sensor + ) + + # Define sensor specifications + sensor_specs <- list( + test_sensor = list( + calibration_uncertainty = 0.02, # 2% + temperature_uncertainty = 0.005, # 0.5% + cosine_uncertainty = 0.01, # 1% + spectral_uncertainty = 0.015 # 1.5% + ) + ) + + # Calculate uncertainty + uncertainty_results <- calculate_measurement_uncertainty( + data = sample_data, + reference_col = "reference_sensor", + test_cols = "test_sensor", + sensor_specs = sensor_specs, + coverage_factor = 2 + ) + + # Generate report + report <- generate_uncertainty_report(uncertainty_results, "uncertainty_report.csv") + + print("Uncertainty analysis completed!") + print(report) + + # Example of ISO GUM calculation + gum_components <- list( + calibration = list(type = "B", value = 16, distribution = "normal"), # 2% of 800 W/m² + temperature = list(type = "B", value = 4), # 0.5% of 800 W/m² + repeatability 
= list(type = "A", value = 15, n = 500) + ) + + gum_result <- calculate_iso_gum_uncertainty(805, gum_components) + print("ISO GUM uncertainty calculation:") + print(gum_result) +} diff --git a/src/data_cleaning/validator.py b/src/data_cleaning/validator.py new file mode 100644 index 0000000..3908596 --- /dev/null +++ b/src/data_cleaning/validator.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +Pyranometer Data Validator +Implements quality control and data cleaning for solar irradiance measurements +Based on BSRN quality control procedures and research standards +""" + +import pandas as pd +import numpy as np +from typing import Dict, List, Tuple, Optional +import logging +from datetime import datetime, timedelta + +logger = logging.getLogger(__name__) + +class PyranometerDataValidator: + """Validate and clean pyranometer data using scientific quality control procedures""" + + def __init__(self): + # Physical limits based on BSRN standards and research + self.physical_limits = { + 'irradiance': { + 'min': 0, # W/m² (can be negative for some sensors due to thermal effects) + 'max': 1500, # W/m² (theoretical maximum for clear sky conditions) + 'extremely_rare_max': 1400 # W/m² (rare but possible in high altitude) + }, + 'temperature': { + 'min': -50, # °C + 'max': 80, # °C + 'extremely_rare_min': -40, + 'extremely_rare_max': 70 + }, + 'rate_of_change': { + 'irradiance': 100, # W/m² per minute (maximum plausible rate) + 'temperature': 5 # °C per minute + } + } + + # Sensor-specific characteristics + self.sensor_specs = { + 'kipp_zonen': { + 'response_time': 5, # seconds + 'uncertainty': 2.0, # % at k=2 + 'cosine_error': 1.0 # % + }, + 'apogee': { + 'response_time': 1, # seconds + 'uncertainty': 5.0, # % at k=2 + 'cosine_error': 2.0 # % + }, + 'hukseflux': { + 'response_time': 1, # seconds + 'uncertainty': 2.5, # % at k=2 + 'cosine_error': 1.5 # % + }, + 'generic': { + 'response_time': 10, # seconds + 'uncertainty': 10.0, # % at k=2 + 'cosine_error': 5.0 # % + } + } 
+ + def apply_physical_limits_check(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame: + """Apply BSRN physical limits quality control""" + df = data.copy() + + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + data_type = sensor['data_type'] + + if data_type == 'irradiance': + # Physical possible limits + mask_physical = (df[col] < self.physical_limits['irradiance']['min']) | \ + (df[col] > self.physical_limits['irradiance']['max']) + + # Extremely rare limits + mask_extreme = df[col] > self.physical_limits['irradiance']['extremely_rare_max'] + + # Add quality flags + df[f'{col}_quality_flag'] = 0 # Default: good + df.loc[mask_extreme, f'{col}_quality_flag'] = 1 # Warning + df.loc[mask_physical, f'{col}_quality_flag'] = 2 # Error + + logger.info(f"Physical limits check for {col}: " + f"{mask_physical.sum()} errors, {mask_extreme.sum()} warnings") + + elif data_type == 'temperature': + mask_physical = (df[col] < self.physical_limits['temperature']['min']) | \ + (df[col] > self.physical_limits['temperature']['max']) + + mask_extreme = (df[col] < self.physical_limits['temperature']['extremely_rare_min']) | \ + (df[col] > self.physical_limits['temperature']['extremely_rare_max']) + + df[f'{col}_quality_flag'] = 0 + df.loc[mask_extreme, f'{col}_quality_flag'] = 1 + df.loc[mask_physical, f'{col}_quality_flag'] = 2 + + logger.info(f"Temperature limits check for {col}: " + f"{mask_physical.sum()} errors, {mask_extreme.sum()} warnings") + + return df + + def detect_outliers_iqr(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame: + """Detect outliers using Interquartile Range method""" + df = data.copy() + + for sensor in sensor_info['sensors']: + if sensor['data_type'] == 'irradiance': + col = sensor['column_name'] + + Q1 = df[col].quantile(0.25) + Q3 = df[col].quantile(0.75) + IQR = Q3 - Q1 + + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + outliers = (df[col] < lower_bound) | (df[col] > upper_bound) + 
df[f'{col}_outlier'] = outliers + + logger.info(f"IQR outlier detection for {col}: {outliers.sum()} outliers found") + + return df + + def check_rate_of_change(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame: + """Check for unrealistic rate of change in measurements""" + df = data.copy() + + # Ensure data is sorted by timestamp + df = df.sort_values('timestamp').reset_index(drop=True) + + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + data_type = sensor['data_type'] + + if data_type == 'irradiance': + # Calculate rate of change (W/m² per minute) + time_diff = df['timestamp'].diff().dt.total_seconds() / 60 # minutes + value_diff = df[col].diff() + rate_of_change = value_diff / time_diff + + # Replace infinite values + rate_of_change = rate_of_change.replace([np.inf, -np.inf], np.nan) + + # Flag unrealistic rates + max_rate = self.physical_limits['rate_of_change']['irradiance'] + rate_outliers = rate_of_change.abs() > max_rate + + df[f'{col}_rate_flag'] = rate_outliers + + logger.info(f"Rate of change check for {col}: {rate_outliers.sum()} flags") + + elif data_type == 'temperature': + time_diff = df['timestamp'].diff().dt.total_seconds() / 60 + value_diff = df[col].diff() + rate_of_change = value_diff / time_diff + rate_of_change = rate_of_change.replace([np.inf, -np.inf], np.nan) + + max_rate = self.physical_limits['rate_of_change']['temperature'] + rate_outliers = rate_of_change.abs() > max_rate + + df[f'{col}_rate_flag'] = rate_outliers + + logger.info(f"Temperature rate check for {col}: {rate_outliers.sum()} flags") + + return df + + def detect_missing_data(self, data: pd.DataFrame, sensor_info: Dict) -> pd.DataFrame: + """Detect missing or invalid data points""" + df = data.copy() + + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + + # Check for NaN values + nan_mask = df[col].isna() + + # Check for constant values (potential sensor failure) + if len(df) > 10: # Need sufficient data + rolling_std = 
df[col].rolling(window=10, min_periods=1).std()
+                constant_mask = rolling_std == 0
+            else:
+                constant_mask = pd.Series(False, index=df.index)
+
+            df[f'{col}_missing_flag'] = nan_mask | constant_mask
+
+            logger.info(f"Missing data check for {col}: "
+                       f"{nan_mask.sum()} NaN, {constant_mask.sum()} constant values")
+
+        return df
+
+    def apply_sun_position_checks(self, data: pd.DataFrame, sensor_info: Dict,
+                                  location: Dict = None) -> pd.DataFrame:
+        """Apply sun position-based quality checks (simplified version).
+
+        sensor_info is required to identify irradiance columns; the previous
+        signature omitted it, so every call raised NameError. location is
+        reserved for a future solar-position calculation and is unused here.
+        """
+        df = data.copy()
+
+        # Simplified: check for nighttime values that should be zero
+        # In a full implementation, you would calculate solar position
+        df['hour'] = df['timestamp'].dt.hour
+
+        # Assume nighttime is between 8 PM and 5 AM (simplified)
+        nighttime_mask = (df['hour'] >= 20) | (df['hour'] < 5)
+
+        # For irradiance sensors, nighttime values should be near zero
+        irradiance_cols = [s['column_name'] for s in sensor_info['sensors']
+                           if s['data_type'] == 'irradiance']
+
+        for col in irradiance_cols:
+            nighttime_high = nighttime_mask & (df[col] > 50)  # More than 50 W/m² at night
+            df[f'{col}_night_flag'] = nighttime_high
+
+            logger.info(f"Nighttime check for {col}: {nighttime_high.sum()} flags")
+
+        return df
+
+    def clean_data(self, data: pd.DataFrame, sensor_info: Dict,
+                   cleaning_strategy: str = 'conservative') -> pd.DataFrame:
+        """Apply comprehensive data cleaning based on selected strategy"""
+        df = data.copy()
+
+        logger.info("Starting data cleaning process...")
+
+        # Apply all validation checks
+        df = self.apply_physical_limits_check(df, sensor_info)
+        df = self.detect_outliers_iqr(df, sensor_info)
+        df = self.check_rate_of_change(df, sensor_info)
+        df = self.detect_missing_data(df, sensor_info)
+
+        # Apply cleaning based on strategy
+        if cleaning_strategy == 'conservative':
+            # Only remove clear errors
+            for sensor in sensor_info['sensors']:
+                col = sensor['column_name']
+                error_mask = df[f'{col}_quality_flag'] == 2
+                df.loc[error_mask,
col] = np.nan + logger.info(f"Conservative cleaning for {col}: {error_mask.sum()} points removed") + + elif cleaning_strategy == 'moderate': + # Remove errors and some warnings + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + remove_mask = (df[f'{col}_quality_flag'] == 2) | \ + (df[f'{col}_outlier']) | \ + (df[f'{col}_rate_flag']) | \ + (df[f'{col}_missing_flag']) + df.loc[remove_mask, col] = np.nan + logger.info(f"Moderate cleaning for {col}: {remove_mask.sum()} points removed") + + elif cleaning_strategy == 'aggressive': + # Remove all flagged data + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + remove_mask = (df[f'{col}_quality_flag'] > 0) | \ + (df[f'{col}_outlier']) | \ + (df[f'{col}_rate_flag']) | \ + (df[f'{col}_missing_flag']) + df.loc[remove_mask, col] = np.nan + logger.info(f"Aggressive cleaning for {col}: {remove_mask.sum()} points removed") + + # Calculate data quality statistics + quality_stats = self.calculate_quality_statistics(df, sensor_info) + + logger.info("Data cleaning completed") + logger.info(f"Quality statistics: {quality_stats}") + + return df, quality_stats + + def calculate_quality_statistics(self, data: pd.DataFrame, sensor_info: Dict) -> Dict: + """Calculate comprehensive quality statistics""" + stats = {} + + for sensor in sensor_info['sensors']: + col = sensor['column_name'] + total_points = len(data) + + if total_points == 0: + continue + + # Count different flag types + quality_flag_col = f'{col}_quality_flag' + if quality_flag_col in data.columns: + error_count = (data[quality_flag_col] == 2).sum() + warning_count = (data[quality_flag_col] == 1).sum() + good_count = (data[quality_flag_col] == 0).sum() + else: + error_count = warning_count = 0 + good_count = total_points + + outlier_count = data.get(f'{col}_outlier', pd.Series(False)).sum() + rate_flag_count = data.get(f'{col}_rate_flag', pd.Series(False)).sum() + missing_count = data.get(f'{col}_missing_flag', pd.Series(False)).sum() + 
+ # Calculate percentages + stats[col] = { + 'total_points': total_points, + 'good_points': good_count, + 'warning_points': warning_count, + 'error_points': error_count, + 'outlier_points': outlier_count, + 'rate_flag_points': rate_flag_count, + 'missing_points': missing_count, + 'data_quality_percentage': (good_count / total_points * 100) if total_points > 0 else 0, + 'completeness_percentage': ((total_points - missing_count) / total_points * 100) if total_points > 0 else 0 + } + + return stats + +# Example usage +if __name__ == "__main__": + # Create sample data for testing + sample_data = { + 'timestamp': pd.date_range('2024-01-01 06:00:00', '2024-01-01 18:00:00', freq='1min'), + 'sensor_1': np.random.normal(800, 50, 721), + 'sensor_2': np.random.normal(810, 60, 721), + 'temperature': np.random.normal(25, 2, 721) + } + + # Add some outliers and errors + sample_data['sensor_1'][100] = 2000 # Physical limit error + sample_data['sensor_2'][200] = -100 # Physical limit error + sample_data['temperature'][300] = 100 # Temperature error + + df = pd.DataFrame(sample_data) + + sensor_info = { + 'sensors': [ + {'column_name': 'sensor_1', 'data_type': 'irradiance'}, + {'column_name': 'sensor_2', 'data_type': 'irradiance'}, + {'column_name': 'temperature', 'data_type': 'temperature'} + ] + } + + validator = PyranometerDataValidator() + + # Test validation + cleaned_data, quality_stats = validator.clean_data(df, sensor_info, 'moderate') + + print("Data validation completed!") + print(f"Original data points: {len(df)}") + print(f"Cleaned data points: {len(cleaned_data)}") + print("Quality statistics:") + for sensor, stats in quality_stats.items(): + print(f" {sensor}: {stats['data_quality_percentage']:.1f}% good data") diff --git a/src/data_ingestion/file_processor.py b/src/data_ingestion/file_processor.py new file mode 100644 index 0000000..67e5864 --- /dev/null +++ b/src/data_ingestion/file_processor.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +""" +Pyranometer Data File 
Processor +Handles CSV and TXT files from various dataloggers and sensor types +""" + +import pandas as pd +import numpy as np +import os +import re +import json +from datetime import datetime, timedelta +from pathlib import Path +import logging +from typing import Dict, List, Optional, Tuple + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class PyranometerFileProcessor: + """Process pyranometer data files from various formats and sensors""" + + def __init__(self, config_path: Optional[str] = None): + self.supported_formats = ['.csv', '.txt', '.dat'] + self.sensor_patterns = { + 'kipp_zonen': r'(CMP|SMP|SP)[\d]+', + 'apogee': r'(SP|SQ)[\d]+', + 'hukseflux': r'(SR)[\d]+', + 'li_cor': r'(LI-200|LI-190)', + 'eppley': r'(PSP|EPPLEY)' + } + + def detect_file_format(self, file_path: str) -> str: + """Detect the format of the data file""" + file_ext = Path(file_path).suffix.lower() + + if file_ext == '.csv': + return 'csv' + elif file_ext == '.txt': + # Check if it's CSV-like or custom format + with open(file_path, 'r') as f: + first_line = f.readline().strip() + if ',' in first_line: + return 'csv_txt' + else: + return 'custom_txt' + else: + return 'unknown' + + def parse_timestamp(self, timestamp_str: str, file_format: str) -> datetime: + """Parse timestamp from various formats""" + timestamp_formats = [ + '%Y-%m-%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', + '%m/%d/%Y %H:%M:%S', + '%Y-%m-%dT%H:%M:%S', + '%Y%m%d_%H%M%S', + '%H:%M:%S %d/%m/%Y' + ] + + for fmt in timestamp_formats: + try: + return datetime.strptime(timestamp_str.strip(), fmt) + except ValueError: + continue + + # If no format matches, try to extract from string + logger.warning(f"Could not parse timestamp: {timestamp_str}") + return datetime.now() + + def detect_sensor_type(self, column_name: str) -> Tuple[str, str]: + """Detect sensor type and model from column name""" + column_lower = column_name.lower() 
+ + # Check for common sensor patterns + for sensor_type, pattern in self.sensor_patterns.items(): + if re.search(pattern, column_name, re.IGNORECASE): + return sensor_type, re.search(pattern, column_name, re.IGNORECASE).group() + + # Check for generic patterns + if any(word in column_lower for word in ['irradiance', 'ghi', 'global']): + return 'generic', 'irradiance_sensor' + elif any(word in column_lower for word in ['temp', 'temperature']): + return 'generic', 'temperature_sensor' + elif any(word in column_lower for word in ['volt', 'voltage']): + return 'generic', 'voltage_sensor' + else: + return 'unknown', 'unknown' + + def process_csv_file(self, file_path: str) -> Dict: + """Process CSV file with pyranometer data""" + try: + # Try different encodings + encodings = ['utf-8', 'latin-1', 'windows-1252'] + df = None + + for encoding in encodings: + try: + df = pd.read_csv(file_path, encoding=encoding) + break + except UnicodeDecodeError: + continue + + if df is None: + raise ValueError("Could not read file with any encoding") + + # Clean column names + df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns] + + # Identify timestamp column + timestamp_cols = [col for col in df.columns if any(word in col for word in + ['time', 'date', 'timestamp', 'datetime'])] + + if not timestamp_cols: + raise ValueError("No timestamp column found") + + timestamp_col = timestamp_cols[0] + + # Parse timestamps + df['timestamp'] = df[timestamp_col].apply( + lambda x: self.parse_timestamp(str(x), 'csv') + ) + + # Identify data columns + irradiance_cols = [col for col in df.columns if any(word in col for word in + ['irradiance', 'ghi', 'global', 'w/m2', 'w_m2'])] + + temperature_cols = [col for col in df.columns if any(word in col for word in + ['temp', 'temperature', '°c', 'deg'])] + + voltage_cols = [col for col in df.columns if any(word in col for word in + ['volt', 'voltage', 'mv', 'v'])] + + # Extract sensor information + sensors = [] + for col in 
irradiance_cols: + sensor_type, model = self.detect_sensor_type(col) + sensors.append({ + 'column_name': col, + 'sensor_type': sensor_type, + 'model': model, + 'data_type': 'irradiance', + 'units': 'W/m²' + }) + + for col in temperature_cols: + sensor_type, model = self.detect_sensor_type(col) + sensors.append({ + 'column_name': col, + 'sensor_type': sensor_type, + 'model': model, + 'data_type': 'temperature', + 'units': '°C' + }) + + for col in voltage_cols: + sensor_type, model = self.detect_sensor_type(col) + sensors.append({ + 'column_name': col, + 'sensor_type': sensor_type, + 'model': model, + 'data_type': 'voltage', + 'units': 'V' + }) + + result = { + 'file_path': file_path, + 'file_format': 'csv', + 'timestamp_column': timestamp_col, + 'sensors': sensors, + 'data': df, + 'start_time': df['timestamp'].min(), + 'end_time': df['timestamp'].max(), + 'sampling_interval': self.calculate_sampling_interval(df['timestamp']), + 'total_measurements': len(df) + } + + logger.info(f"Processed CSV file: {file_path}") + logger.info(f"Found {len(sensors)} sensors, {len(df)} measurements") + + return result + + except Exception as e: + logger.error(f"Error processing CSV file {file_path}: {str(e)}") + raise + + def process_txt_file(self, file_path: str) -> Dict: + """Process TXT file with pyranometer data""" + try: + # Read file to determine format + with open(file_path, 'r') as f: + lines = f.readlines() + + # Check if it's CSV-like format + if ',' in lines[0]: + return self.process_csv_file(file_path) + + # Handle custom TXT format (space or tab separated) + # This is a simplified version - you may need to customize based on your specific format + df = pd.read_csv(file_path, sep='\s+', engine='python') + + # Similar processing as CSV + df.columns = [f'col_{i}' for i in range(len(df.columns))] + + # Assume first column is timestamp, others are data + timestamp_col = 'col_0' + df['timestamp'] = df[timestamp_col].apply( + lambda x: self.parse_timestamp(str(x), 'txt') + ) + + 
sensors = [] + for i, col in enumerate(df.columns[1:], 1): # Skip timestamp column + sensors.append({ + 'column_name': col, + 'sensor_type': 'unknown', + 'model': f'sensor_{i}', + 'data_type': 'irradiance', # Assume irradiance by default + 'units': 'W/m²' + }) + + result = { + 'file_path': file_path, + 'file_format': 'txt', + 'timestamp_column': timestamp_col, + 'sensors': sensors, + 'data': df, + 'start_time': df['timestamp'].min(), + 'end_time': df['timestamp'].max(), + 'sampling_interval': self.calculate_sampling_interval(df['timestamp']), + 'total_measurements': len(df) + } + + logger.info(f"Processed TXT file: {file_path}") + logger.info(f"Found {len(sensors)} sensors, {len(df)} measurements") + + return result + + except Exception as e: + logger.error(f"Error processing TXT file {file_path}: {str(e)}") + raise + + def calculate_sampling_interval(self, timestamps: pd.Series) -> float: + """Calculate average sampling interval in seconds""" + if len(timestamps) < 2: + return 0 + + time_diffs = [] + for i in range(1, len(timestamps)): + diff = (timestamps.iloc[i] - timestamps.iloc[i-1]).total_seconds() + time_diffs.append(diff) + + return np.mean(time_diffs) if time_diffs else 0 + + def process_file(self, file_path: str) -> Dict: + """Main method to process any supported file""" + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + file_format = self.detect_file_format(file_path) + + if file_format in ['csv', 'csv_txt']: + return self.process_csv_file(file_path) + elif file_format == 'custom_txt': + return self.process_txt_file(file_path) + else: + raise ValueError(f"Unsupported file format: {file_format}") + + def batch_process_files(self, directory_path: str) -> List[Dict]: + """Process all supported files in a directory""" + processed_files = [] + + for file_path in Path(directory_path).glob('*'): + if file_path.suffix.lower() in self.supported_formats: + try: + result = self.process_file(str(file_path)) + 
processed_files.append(result) + except Exception as e: + logger.error(f"Failed to process {file_path}: {str(e)}") + continue + + return processed_files + +# Example usage +if __name__ == "__main__": + processor = PyranometerFileProcessor() + + # Test with a sample file + sample_data = { + 'timestamp': ['2024-01-01 10:00:00', '2024-01-01 10:01:00', '2024-01-01 10:02:00'], + 'kipp_zonen_cmp11': [850.5, 852.1, 851.8], + 'apogee_sp110': [848.2, 849.5, 850.1], + 'reference_sensor': [851.0, 852.5, 852.0], + 'temperature': [25.3, 25.4, 25.5] + } + + df = pd.DataFrame(sample_data) + df.to_csv('sample_pyranometer_data.csv', index=False) + + # Process the sample file + try: + result = processor.process_file('sample_pyranometer_data.csv') + print("File processed successfully!") + print(f"Sensors found: {len(result['sensors'])}") + print(f"Time range: {result['start_time']} to {result['end_time']}") + print(f"Sampling interval: {result['sampling_interval']} seconds") + except Exception as e: + print(f"Error: {e}") diff --git a/src/database/schema.sql b/src/database/schema.sql new file mode 100644 index 0000000..6a5dcd7 --- /dev/null +++ b/src/database/schema.sql @@ -0,0 +1,196 @@ +-- Pyranometer Data Analysis System Database Schema +-- Supports multiple sensor types, calibration sessions, and advanced analysis + +-- Sensor metadata table +CREATE TABLE IF NOT EXISTS sensors ( + sensor_id TEXT PRIMARY KEY, + manufacturer TEXT NOT NULL, + model TEXT NOT NULL, + serial_number TEXT UNIQUE, + sensor_type TEXT CHECK(sensor_type IN ('thermopile', 'photodiode', 'reference', 'test')), + classification TEXT CHECK(classification IN ('A', 'B', 'C', 'Secondary', 'Primary')), + calibration_date DATE, + calibration_certificate TEXT, + uncertainty_value REAL, -- Expanded uncertainty at k=2 + uncertainty_units TEXT DEFAULT 'W/m²', + sensitivity REAL, -- Sensor sensitivity in μV/W/m² + response_time REAL, -- Response time in seconds + spectral_range_min REAL, -- nm + spectral_range_max REAL, -- 
nm + temperature_coefficient REAL, + cosine_correction_factor REAL, + azimuth_correction_factor REAL, + zero_offset_a REAL, + zero_offset_b REAL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Calibration sessions table +CREATE TABLE IF NOT EXISTS calibration_sessions ( + session_id TEXT PRIMARY KEY, + session_date DATE NOT NULL, + location TEXT, + operator TEXT, + reference_sensor_id TEXT REFERENCES sensors(sensor_id), + weather_conditions TEXT, + temperature REAL, + humidity REAL, + atmospheric_pressure REAL, + notes TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Raw measurement data table +CREATE TABLE IF NOT EXISTS measurements ( + measurement_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT REFERENCES calibration_sessions(session_id), + timestamp TIMESTAMP NOT NULL, + sensor_id TEXT REFERENCES sensors(sensor_id), + irradiance REAL NOT NULL, -- W/m² + temperature REAL, -- °C + voltage REAL, -- mV or V + quality_flag INTEGER DEFAULT 0, -- 0=good, 1=warning, 2=error + outlier_flag BOOLEAN DEFAULT FALSE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Statistical analysis results table +CREATE TABLE IF NOT EXISTS statistical_results ( + result_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT REFERENCES calibration_sessions(session_id), + sensor_id TEXT REFERENCES sensors(sensor_id), + analysis_type TEXT NOT NULL, + period_start TIMESTAMP, + period_end TIMESTAMP, + + -- Basic statistics + n_observations INTEGER, + mean_value REAL, + median_value REAL, + std_deviation REAL, + min_value REAL, + max_value REAL, + q1_value REAL, + q3_value REAL, + + -- Advanced metrics + mbe REAL, -- Mean Bias Error + rmse REAL, -- Root Mean Square Error + mae REAL, -- Mean Absolute Error + r_squared REAL, + correlation_coefficient REAL, + theil_u REAL, + + -- Uncertainty metrics + type_a_uncertainty REAL, + type_b_uncertainty REAL, + combined_uncertainty REAL, + 
expanded_uncertainty REAL, + coverage_factor REAL DEFAULT 2.0, + + -- Performance indicators + performance_ratio REAL, + clear_sky_index REAL, + variability_index REAL, + + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Sensor comparison results table +CREATE TABLE IF NOT EXISTS comparison_results ( + comparison_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT REFERENCES calibration_sessions(session_id), + test_sensor_id TEXT REFERENCES sensors(sensor_id), + reference_sensor_id TEXT REFERENCES sensors(sensor_id), + + -- Comparison metrics + bias REAL, + relative_bias REAL, + mbe REAL, + relative_mbe REAL, + rmse REAL, + relative_rmse REAL, + mae REAL, + relative_mae REAL, + r_squared REAL, + slope REAL, + intercept REAL, + + -- Statistical tests + ks_statistic REAL, -- Kolmogorov-Smirnov + ks_p_value REAL, + mw_statistic REAL, -- Mann-Whitney + mw_p_value REAL, + + -- Uncertainty comparison + combined_uncertainty REAL, + coverage_probability REAL, + + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Metrology chain table +CREATE TABLE IF NOT EXISTS metrology_chain ( + chain_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT REFERENCES calibration_sessions(session_id), + instrument_sequence TEXT NOT NULL, -- JSON array of instrument IDs in order + calibration_dates TEXT, -- JSON array of calibration dates + uncertainty_contributions TEXT, -- JSON with uncertainty breakdown + traceability_statement TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Quality control flags table +CREATE TABLE IF NOT EXISTS quality_flags ( + flag_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT REFERENCES calibration_sessions(session_id), + timestamp TIMESTAMP, + sensor_id TEXT REFERENCES sensors(sensor_id), + flag_type TEXT CHECK(flag_type IN ('physical_limit', 'extremely_rare', 'comparison', 'consistency')), + flag_value INTEGER, -- 0=good, 1=warning, 2=error + description TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + 
+-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_measurements_timestamp ON measurements(timestamp); +CREATE INDEX IF NOT EXISTS idx_measurements_sensor_id ON measurements(sensor_id); +CREATE INDEX IF NOT EXISTS idx_measurements_session_id ON measurements(session_id); +CREATE INDEX IF NOT EXISTS idx_statistical_session_id ON statistical_results(session_id); +CREATE INDEX IF NOT EXISTS idx_comparison_session_id ON comparison_results(session_id); + +-- Views for common queries +CREATE VIEW IF NOT EXISTS sensor_performance_summary AS +SELECT + s.sensor_id, + s.manufacturer, + s.model, + COUNT(DISTINCT cr.session_id) as calibration_count, + AVG(cr.bias) as avg_bias, + AVG(cr.rmse) as avg_rmse, + AVG(cr.r_squared) as avg_r_squared +FROM sensors s +LEFT JOIN comparison_results cr ON s.sensor_id = cr.test_sensor_id +GROUP BY s.sensor_id, s.manufacturer, s.model; + +CREATE VIEW IF NOT EXISTS session_statistics AS +SELECT + cs.session_id, + cs.session_date, + cs.location, + COUNT(DISTINCT m.sensor_id) as sensor_count, + MIN(m.timestamp) as start_time, + MAX(m.timestamp) as end_time, + COUNT(m.measurement_id) as total_measurements +FROM calibration_sessions cs +JOIN measurements m ON cs.session_id = m.session_id +GROUP BY cs.session_id, cs.session_date, cs.location; + +-- Triggers for updated_at timestamps +CREATE TRIGGER IF NOT EXISTS update_sensors_timestamp +AFTER UPDATE ON sensors +FOR EACH ROW +BEGIN + UPDATE sensors SET updated_at = CURRENT_TIMESTAMP WHERE sensor_id = NEW.sensor_id; +END; diff --git a/src/reporting/report_generator.R b/src/reporting/report_generator.R new file mode 100644 index 0000000..3eb6bc0 --- /dev/null +++ b/src/reporting/report_generator.R @@ -0,0 +1,343 @@ +# Automated Report Generator for Pyranometer Data Analysis +# Creates comprehensive PDF reports with statistical analysis, uncertainty calculations, and visualizations + +# Load required libraries +library(rmarkdown) +library(knitr) +library(ggplot2) +library(gridExtra) 
+library(scales) +library(lubridate) +library(dplyr) +library(tidyr) + +#' Generate comprehensive pyranometer analysis report +#' +#' @param data Processed data frame with sensor measurements +#' @param statistical_results Results from statistical analysis +#' @param uncertainty_results Results from uncertainty analysis +#' @param sensor_info Information about sensors and calibration +#' @param output_dir Directory to save the report +#' @param report_title Title for the report +#' @param author_name Name of the report author +#' @param institution_name Name of the institution +#' @return Path to the generated PDF report +generate_pyranometer_report <- function(data, statistical_results, uncertainty_results, + sensor_info, output_dir = "reports", + report_title = "Pyranometer Calibration Analysis Report", + author_name = "Automated Analysis System", + institution_name = "Solar Metrology Laboratory") { + + # Create output directory if it doesn't exist + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } + + # Generate timestamp for report filename + timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S") + report_filename <- paste0("pyranometer_report_", timestamp, ".pdf") + report_path <- file.path(output_dir, report_filename) + + # Create R Markdown content + rmd_content <- create_rmd_content(data, statistical_results, uncertainty_results, + sensor_info, report_title, author_name, institution_name) + + # Write R Markdown file + rmd_file <- tempfile(fileext = ".Rmd") + writeLines(rmd_content, rmd_file) + + # Render PDF report + cat("Generating PDF report...\n") + render(rmd_file, output_file = report_path, output_format = "pdf_document") + + # Clean up temporary file + file.remove(rmd_file) + + cat("Report generated successfully:", report_path, "\n") + return(report_path) +} + +#' Create R Markdown content for the report +create_rmd_content <- function(data, statistical_results, uncertainty_results, + sensor_info, title, author, institution) { + 
+ rmd_content <- c( + "---", + paste("title:", shQuote(title)), + paste("author:", shQuote(author)), + paste("institution:", shQuote(institution)), + paste("date:", shQuote(format(Sys.time(), "%B %d, %Y"))), + "output: pdf_document", + "geometry: margin=1in", + "header-includes:", + " - \\usepackage{booktabs}", + " - \\usepackage{longtable}", + " - \\usepackage{array}", + " - \\usepackage{multirow}", + " - \\usepackage{wrapfig}", + " - \\usepackage{float}", + " - \\usepackage{colortbl}", + " - \\usepackage{pdflscape}", + " - \\usepackage{tabu}", + " - \\usepackage{threeparttable}", + "---", + "", + "```{r setup, include=FALSE}", + "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.align = 'center')", + "library(ggplot2)", + "library(dplyr)", + "library(tidyr)", + "library(scales)", + "library(gridExtra)", + "library(lubridate)", + "```", + "", + "# Executive Summary", + "", + "This report presents the comprehensive analysis of pyranometer calibration data collected during the measurement session. 
The analysis includes statistical comparison of test sensors against the reference standard, uncertainty calculations following ISO GUM guidelines, and performance assessment of each sensor.", + "", + "## Key Findings", + "", + "```{r executive-summary, echo=FALSE}", + "# Generate executive summary table", + "exec_summary <- data.frame()", + "for (sensor_name in names(statistical_results)) {", + " stats <- statistical_results[[sensor_name]]", + " uncertainty <- uncertainty_results[[sensor_name]]", + " ", + " summary_row <- data.frame(", + " Sensor = sensor_name,", + " MBE = round(stats$error_metrics$mbe, 2),", + " RMSE = round(stats$error_metrics$rmse, 2),", + " R_squared = round(stats$distribution_analysis$r_squared, 3),", + " Expanded_Uncertainty = round(uncertainty$expanded_uncertainty$expanded_uncertainty, 2),", + " Performance_Ratio = round(stats$performance_indicators$performance_ratio, 3)", + " )", + " exec_summary <- rbind(exec_summary, summary_row)", + "}", + "knitr::kable(exec_summary, caption = 'Executive Summary: Key Performance Metrics')", + "```", + "", + "# 1. 
Introduction", + "", + "## 1.1 Measurement Objectives", + "This calibration session aims to:", + "- Compare the performance of multiple pyranometers against a reference standard", + "- Calculate measurement uncertainties following international standards", + "- Assess the long-term stability and reliability of each sensor", + "- Provide traceable calibration results for quality assurance", + "", + "## 1.2 Measurement Setup", + "```{r setup-info, echo=FALSE}", + "# Display measurement setup information", + "setup_info <- data.frame(", + " Parameter = c('Reference Sensor', 'Number of Test Sensors', 'Measurement Duration',", + " 'Sampling Interval', 'Total Measurements'),", + " Value = c(sensor_info$reference_sensor, length(names(statistical_results)),", + " paste(round(difftime(max(data$timestamp), min(data$timestamp), units = 'hours'), 'hours'),", + " paste(round(mean(diff(data$timestamp)), 'seconds'),", + " nrow(data))", + ")", + "knitr::kable(setup_info, caption = 'Measurement Setup Parameters')", + "```", + "", + "# 2. 
Data Quality Assessment", + "", + "## 2.1 Data Completeness", + "```{r data-quality, echo=FALSE}", + "# Calculate data completeness", + "completeness_stats <- data.frame()", + "for (sensor_name in names(statistical_results)) {", + " complete_cases <- sum(complete.cases(data[[sensor_name]], data[[sensor_info$reference_sensor]]))", + " total_cases <- nrow(data)", + " completeness_pct <- round(complete_cases / total_cases * 100, 1)", + " ", + " quality_row <- data.frame(", + " Sensor = sensor_name,", + " Complete_Measurements = complete_cases,", + " Total_Measurements = total_cases,", + " Completeness = paste0(completeness_pct, '%')", + " )", + " completeness_stats <- rbind(completeness_stats, quality_row)", + "}", + "knitr::kable(completeness_stats, caption = 'Data Completeness Analysis')", + "```", + "", + "## 2.2 Time Series Visualization", + "```{r time-series, echo=FALSE, fig.height=6, fig.width=10}", + "# Create time series plot", + "p1 <- ggplot(data, aes(x = timestamp)) +", + " geom_line(aes(y = !!sym(sensor_info$reference_sensor), color = 'Reference'), alpha = 0.7) +", + " labs(title = 'Solar Irradiance Time Series',", + " x = 'Time', y = 'Irradiance (W/m²)', color = 'Sensor') +", + " theme_minimal() +", + " scale_color_manual(values = c('Reference' = 'blue'))", + "", + "colors <- c('red', 'green', 'purple', 'orange', 'brown')", + "color_idx <- 1", + "for (sensor_name in names(statistical_results)) {", + " p1 <- p1 + geom_line(aes(y = !!sym(sensor_name), color = sensor_name), alpha = 0.5)", + " color_idx <- color_idx + 1", + "}", + "print(p1)", + "```", + "", + "# 3. 
Statistical Analysis", + "", + "## 3.1 Basic Descriptive Statistics", + "```{r descriptive-stats, echo=FALSE}", + "# Create descriptive statistics table", + "desc_stats <- data.frame()", + "for (sensor_name in names(statistical_results)) {", + " stats <- statistical_results[[sensor_name]]$basic_statistics", + " ", + " desc_row <- data.frame(", + " Sensor = sensor_name,", + " Mean = round(stats$test_mean, 1),", + " Median = round(stats$test_median, 1),", + " SD = round(stats$test_sd, 1),", + " Min = round(stats$test_min, 1),", + " Max = round(stats$test_max, 1),", + " IQR = round(stats$test_iqr, 1)", + " )", + " desc_stats <- rbind(desc_stats, desc_row)", + "}", + "knitr::kable(desc_stats, caption = 'Descriptive Statistics for Test Sensors')", + "```", + "", + "## 3.2 Error Metrics and Performance Indicators", + "```{r error-metrics, echo=FALSE}", + "# Create error metrics table", + "error_table <- data.frame()", + "for (sensor_name in names(statistical_results)) {", + " errors <- statistical_results[[sensor_name]]$error_metrics", + " performance <- statistical_results[[sensor_name]]$performance_indicators", + " ", + " error_row <- data.frame(", + " Sensor = sensor_name,", + " MBE = round(errors$mbe, 2),", + " MAE = round(errors$mae, 2),", + " RMSE = round(errors$rmse, 2),", + " R_squared = round(statistical_results[[sensor_name]]$distribution_analysis$r_squared, 3),", + " Performance_Ratio = round(performance$performance_ratio, 3),", + " Variability_Index = round(performance$variability_index, 3)", + " )", + " error_table <- rbind(error_table, error_row)", + "}", + "knitr::kable(error_table, caption = 'Error Metrics and Performance Indicators')", + "```", + "", + "## 3.3 Correlation Analysis", + "```{r correlation-plot, echo=FALSE, fig.height=6, fig.width=8}", + "# Create correlation scatter plots", + "plot_list <- list()", + "for (sensor_name in names(statistical_results)[1:min(4, length(statistical_results))]) {", + " p <- ggplot(data, aes(x = 
!!sym(sensor_info$reference_sensor), y = !!sym(sensor_name))) +", + " geom_point(alpha = 0.3, size = 0.5) +", + " geom_smooth(method = 'lm', se = TRUE, color = 'red') +", + " labs(title = paste('Correlation:', sensor_name),", + " x = 'Reference (W/m²)', y = paste(sensor_name, '(W/m²)')) +", + " theme_minimal()", + " plot_list[[sensor_name]] <- p", + "}", + "grid.arrange(grobs = plot_list, ncol = 2)", + "```", + "", + "# 4. Uncertainty Analysis", + "", + "## 4.1 Measurement Uncertainty Budget", + "```{r uncertainty-budget, echo=FALSE}", + "# Create uncertainty budget table", + "uncertainty_table <- data.frame()", + "for (sensor_name in names(uncertainty_results)) {", + " uncertainty <- uncertainty_results[[sensor_name]]", + " ", + " uncertainty_row <- data.frame(", + " Sensor = sensor_name,", + " Type_A_Uncertainty = round(uncertainty$type_a_uncertainty$standard_uncertainty, 2),", + " Type_B_Uncertainty = round(uncertainty$type_b_uncertainty$combined_uncertainty, 2),", + " Combined_Uncertainty = round(uncertainty$combined_uncertainty$combined_standard_uncertainty, 2),", + " Expanded_Uncertainty = round(uncertainty$expanded_uncertainty$expanded_uncertainty, 2),", + " Coverage_Factor = uncertainty$coverage_factor", + " )", + " uncertainty_table <- rbind(uncertainty_table, uncertainty_row)", + "}", + "knitr::kable(uncertainty_table, caption = 'Measurement Uncertainty Analysis')", + "```", + "", + "## 4.2 Uncertainty Contributions", + "```{r uncertainty-pie, echo=FALSE, fig.height=6, fig.width=8}", + "# Create uncertainty contribution plot", + "if (length(uncertainty_results) > 0) {", + " sensor_name <- names(uncertainty_results)[1]", + " uncertainty <- uncertainty_results[[sensor_name]]", + " ", + " type_a_contrib <- uncertainty$type_a_uncertainty$standard_uncertainty^2", + " type_b_contrib <- uncertainty$type_b_uncertainty$combined_uncertainty^2", + " total_variance <- type_a_contrib + type_b_contrib", + " ", + " contrib_data <- data.frame(", + " Source = c('Type A 
(Random)', 'Type B (Systematic)'),", + " Contribution = c(type_a_contrib/total_variance * 100, type_b_contrib/total_variance * 100)", + " )", + " ", + " p <- ggplot(contrib_data, aes(x = '', y = Contribution, fill = Source)) +", + " geom_bar(stat = 'identity', width = 1) +", + " coord_polar('y', start = 0) +", + " labs(title = paste('Uncertainty Contribution for', sensor_name),", + " fill = 'Uncertainty Type') +", + " theme_void() +", + " geom_text(aes(label = paste0(round(Contribution, 1), '%')),", + " position = position_stack(vjust = 0.5))", + " print(p)", + "}", + "```", + "", + "# 5. Statistical Tests and Distribution Analysis", + "", + "## 5.1 Hypothesis Testing Results", + "```{r statistical-tests, echo=FALSE}", + "# Create statistical tests table", + "test_table <- data.frame()", + "for (sensor_name in names(statistical_results)) {", + " tests <- statistical_results[[sensor_name]]$statistical_tests", + " ", + " test_row <- data.frame(", + " Sensor = sensor_name,", + " KS_Statistic = round(tests$ks_statistic, 4),", + " KS_p_value = format.pval(tests$ks_p_value, digits = 3),", + " MW_Statistic = round(tests$mw_statistic, 1),", + " MW_p_value = format.pval(tests$mw_p_value, digits = 3),", + " Normality_p_value = format.pval(statistical_results[[sensor_name]]$distribution_analysis$errors_normality_p, digits = 3)", + " )", + " test_table <- rbind(test_table, test_row)", + "}", + "knitr::kable(test_table, caption = 'Statistical Hypothesis Test Results')", + "```", + "", + "## 5.2 Error Distribution", + "```{r error-distribution, echo=FALSE, fig.height=6, fig.width=10}", + "# Create error distribution plots", + "plot_list <- list()", + "for (sensor_name in names(statistical_results)[1:min(4, length(statistical_results))]) {", + " errors <- data[[sensor_name]] - data[[sensor_info$reference_sensor]]", + " error_data <- data.frame(Errors = errors[!is.na(errors)])", + " ", + " p <- ggplot(error_data, aes(x = Errors)) +", + " geom_histogram(aes(y = ..density..), bins = 
30, fill = 'lightblue', alpha = 0.7) +", + " geom_density(color = 'red', size = 1) +", + " geom_vline(xintercept = mean(errors, na.rm = TRUE), color = 'blue', linetype = 'dashed', size = 1) +", + " labs(title = paste('Error Distribution:', sensor_name),", + " x = 'Error (W/m²)', y = 'Density') +", + " theme_minimal()", + " plot_list[[sensor_name]] <- p", + "}", + "grid.arrange(grobs = plot_list, ncol = 2)", + "```", + "", + "# 6. Conclusions and Recommendations", + "", + "## 6.1 Performance Classification", + "Based on the analysis results, sensors can be classified as follows:", + "", + "```{r classification, echo