diff --git a/DIMENSIONALITY_REDUCTION.md b/DIMENSIONALITY_REDUCTION.md new file mode 100644 index 0000000..4a582f0 --- /dev/null +++ b/DIMENSIONALITY_REDUCTION.md @@ -0,0 +1,236 @@ +# Dimensionality Reduction for CatUmap + +This document describes the additional dimensionality reduction techniques available in the CatUmap repository beyond the existing UMAP implementation. + +## Overview + +The new dimensionality reduction tools can process the `outputRaw` files generated by `main.py` and apply various techniques to collapse millions of geospatial features into meaningful clusters and 2D/3D visualizations. + +## Available Techniques + +### 1. Principal Component Analysis (PCA) +- **Type**: Linear dimensionality reduction +- **Best for**: Fast processing, interpretable components, initial data exploration +- **Speed**: Very fast, scales well to millions of points +- **Use case**: Understanding variance structure in your data + +### 2. t-Distributed Stochastic Neighbor Embedding (t-SNE) +- **Type**: Non-linear dimensionality reduction +- **Best for**: High-quality visualizations, revealing local structure +- **Speed**: Slower, recommended to sample large datasets +- **Use case**: Creating publication-quality scatter plots + +### 3. Truncated Singular Value Decomposition (SVD) +- **Type**: Linear dimensionality reduction +- **Best for**: Very large datasets, fast processing +- **Speed**: Fastest method available +- **Use case**: Initial exploration of massive datasets + +### 4. 
Independent Component Analysis (ICA) +- **Type**: Linear dimensionality reduction +- **Best for**: Finding independent source signals +- **Speed**: Fast +- **Use case**: When you expect independent underlying processes + +## Quick Start + +### Basic Usage + +```bash +# Apply PCA and SVD to lat, lon, Speed columns +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/pca_svd_results.tsv.gz \ + --columns lat lon Speed \ + --methods pca svd \ + --standardize + +# Apply t-SNE with sampling for large datasets +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/tsne_results.tsv.gz \ + --columns lat lon Speed Accuracy \ + --methods tsne \ + --sample 10000 \ + --standardize +``` + +### Generate Visualizations + +```bash +# Create plots for PCA and SVD results +python3 plot_dim_reduction.py \ + --input output/pca_svd_results.tsv.gz \ + --output pca_svd_plots.png \ + --methods pca svd \ + --color_by Activity +``` + +### Complete Example + +```bash +# Run the comprehensive example +python3 example_usage.py +``` + +## Command Line Options + +### dimensionality_reduction.py + +**Input/Output:** +- `--input`: Input TSV.gz file (outputRaw from main.py) +- `--output`: Output TSV.gz file with results + +**Methods:** +- `--methods`: Choose from `pca`, `tsne`, `svd`, `ica` (can specify multiple) +- `--components`: Number of dimensions to reduce to (default: 2) + +**Data Processing:** +- `--columns`: Columns to use for reduction (default: lat, lon, Speed) +- `--standardize`: Standardize columns before processing (recommended) +- `--sample`: Sample N rows for faster computation + +**t-SNE Parameters:** +- `--tsne_perplexity`: Perplexity parameter (default: 30) +- `--tsne_learning_rate`: Learning rate (default: 200) +- `--tsne_max_iter`: Maximum iterations (default: 1000) + +**Clustering:** +- `--add_clusters`: Add K-means clustering results +- `--n_clusters`: Number of clusters (default: 8) + +### plot_dim_reduction.py 
+ +- `--input`: Input TSV.gz file with dimensionality reduction results +- `--output`: Output plot file (.png; default: `dim_reduction_plots.png`) +- `--methods`: Methods to plot, chosen from `pca`, `tsne`, `svd`, `ica` (default: `pca svd ica`) +- `--color_by`: Column to use for coloring points (default: `Activity`) +- `--figsize`: Figure size as two space-separated numbers, `WIDTH HEIGHT` (default: `16 12`) + +## Performance Recommendations + +### For Large Datasets (>100K points) + +1. **Start with linear methods**: Use PCA or SVD first for fast exploration +2. **Sample for t-SNE**: Use `--sample 10000` or similar for t-SNE +3. **Use standardization**: Always use `--standardize` for mixed-scale data +4. **Batch processing**: Process subsets of your data separately + +### Method Selection Guide + +| Dataset Size | Primary Goal | Recommended Method | Notes | +|-------------|--------------|------------------|-------| +| <10K points | Visualization | t-SNE | High quality plots | +| 10K-100K | Exploration | PCA + SVD | Fast, interpretable | +| 100K-1M | Fast clustering | SVD + clustering | Very fast | +| >1M points | Initial exploration | PCA with sampling | Use `--sample` | + +## Input Data Format + +The script expects TSV.gz files with columns including: +- `lat`, `lon`: Geographic coordinates +- `Speed`: Movement speed +- `Activity`: Activity type (optional, good for coloring) +- `Name`: Entity identifier (optional, good for coloring) +- Any other numeric columns of interest + +## Output Format + +The output files contain: +- All original columns +- New columns for each method: `pca_0`, `pca_1`, `tsne_0`, `tsne_1`, etc. +- Standardized columns (if `--standardize` used): `lat_standardized`, etc. +- Clustering results (if requested): `kmeans_cluster` + +## Integration with Existing Pipeline + +This tool is designed to work with the existing CatUmap pipeline: + +1. **Generate raw data** with `main.py --outputRaw output/raw.tsv.gz` +2. **Apply dimensionality reduction** with `dimensionality_reduction.py` +3. **Create visualizations** with `plot_dim_reduction.py` or use the existing R scripts +4. 
**Further analysis** in R using the existing plotting infrastructure + +## Examples + +### Example 1: Fast Linear Methods +```bash +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/linear_analysis.tsv.gz \ + --columns lat lon Speed Accuracy Elevation \ + --methods pca svd \ + --components 3 \ + --standardize \ + --add_clusters --n_clusters 5 +``` + +### Example 2: High-Quality Visualization +```bash +# Sample data for t-SNE +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/tsne_visualization.tsv.gz \ + --columns lat lon Speed \ + --methods tsne \ + --sample 5000 \ + --standardize \ + --tsne_perplexity 50 \ + --tsne_learning_rate 200 + +# Create visualization +python3 plot_dim_reduction.py \ + --input output/tsne_visualization.tsv.gz \ + --output tsne_activity_plot.png \ + --methods tsne \ + --color_by Activity +``` + +### Example 3: Comprehensive Analysis +```bash +# Apply multiple methods +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/comprehensive.tsv.gz \ + --columns lat lon Speed Accuracy \ + --methods pca svd ica \ + --standardize \ + --add_clusters --n_clusters 8 + +# Visualize results +python3 plot_dim_reduction.py \ + --input output/comprehensive.tsv.gz \ + --output comprehensive_plots.png \ + --methods pca svd ica \ + --color_by Name +``` + +## Troubleshooting + +### Common Issues + +1. **Memory errors with t-SNE**: Use `--sample` to reduce dataset size +2. **Poor clustering results**: Try `--standardize` and experiment with different columns +3. **Plots look crowded**: Plot fewer points (`--sample`), enlarge the figure with `--figsize WIDTH HEIGHT`, or use `R/dim_reduction_plot.R`, which exposes `--point_size` and `--alpha` +4. **Methods not found**: The plotting scripts look for `<method>_0` and `<method>_1` columns; make sure every method you plot was also passed to `--methods` when running `dimensionality_reduction.py` + +### Performance Tips + +1. Use SVD for initial exploration of very large datasets +2. Apply PCA first to reduce dimensions before t-SNE (run t-SNE as a second pass on the PCA output, e.g. `--columns pca_0 pca_1 pca_2`) +3. Experiment with different column combinations +4. 
#!/usr/bin/env Rscript
# Scatter-plot the 2D embeddings stored in a dimensionality reduction result
# table.
#
# For every requested method the script looks for the columns `<method>_0`
# and `<method>_1`, draws one panel per method coloured by `--color_by`,
# saves the combined figure and prints per-component summary statistics
# (plus a cluster size table when a `kmeans_cluster` column is present).
library(data.table)
library(ggplot2)
library(optparse)
library(RColorBrewer)
library(gridExtra)

# Parse command line arguments
option_list <- list(
  make_option(c("-i", "--input"), type = "character",
              default = "../output/test_all_methods.tsv.gz",
              help = "input file with dimensionality reduction results"),
  make_option(c("-o", "--output"), type = "character",
              default = "../output/dim_reduction_plots.png",
              help = "output plot file"),
  make_option(c("-m", "--methods"), type = "character",
              default = "pca,svd,ica,tsne",
              help = "comma-separated list of methods to plot"),
  make_option(c("-c", "--color_by"), type = "character",
              default = "Activity",
              help = "column to use for coloring points"),
  make_option(c("-a", "--alpha"), type = "numeric", default = 0.6,
              help = "point transparency"),
  make_option(c("-s", "--point_size"), type = "numeric", default = 0.8,
              help = "point size"),
  make_option(c("-w", "--width"), type = "numeric", default = 16,
              help = "plot width in inches"),
  # The short flag is "-H", not "-h": OptionParser() adds its own
  # "-h/--help" option by default, and a second "-h" collides with it.
  make_option(c("-H", "--height"), type = "numeric", default = 12,
              help = "plot height in inches")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Load data
cat("Loading data from:", opt$input, "\n")
df <- fread(opt$input)
cat("Loaded", nrow(df), "rows and", ncol(df), "columns\n")
cat("Columns:", paste(colnames(df), collapse = ", "), "\n")

# Parse methods
methods <- trimws(strsplit(opt$methods, ",")[[1]])

# Fail early with a readable message instead of crashing inside brewer.pal()
# when the colour column does not exist.
if (!(opt$color_by %in% colnames(df))) {
  cat("Error: Color column", opt$color_by, "not found in data\n")
  quit(status = 1)
}

# scale_color_manual() needs a discrete variable; converting to a factor also
# makes numeric label columns such as kmeans_cluster usable for colouring.
set(df, j = opt$color_by, value = factor(df[[opt$color_by]]))

# Color palette: "Spectral" only defines 3 to 11 colours, so interpolate when
# the number of categories falls outside that range. A manual scale with fewer
# colours than categories would otherwise abort the plot.
n_levels <- nlevels(df[[opt$color_by]])
if (n_levels >= 3 && n_levels <= 11) {
  colors <- brewer.pal(n_levels, "Spectral")
} else {
  colors <- colorRampPalette(brewer.pal(11, "Spectral"))(n_levels)
}

# Create plots for each method
plots <- list()

for (method in methods) {
  x_col <- paste0(method, "_0")
  y_col <- paste0(method, "_1")

  if (!(x_col %in% colnames(df) && y_col %in% colnames(df))) {
    cat("Warning: Columns", x_col, "and/or", y_col,
        "not found for method", method, "\n")
    next
  }

  cat("Creating plot for", method, "\n")

  # The .data pronoun replaces the deprecated aes_string() for column names
  # that are only known at run time.
  p <- ggplot(df, aes(x = .data[[x_col]], y = .data[[y_col]],
                      color = .data[[opt$color_by]])) +
    geom_point(alpha = opt$alpha, size = opt$point_size) +
    scale_color_manual(values = colors) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
      axis.title = element_text(size = 12),
      legend.title = element_text(size = 11),
      legend.text = element_text(size = 10)
    ) +
    labs(
      title = paste(toupper(method), "Dimensionality Reduction"),
      x = paste(toupper(method), "Component 1"),
      y = paste(toupper(method), "Component 2"),
      color = opt$color_by
    ) +
    guides(color = guide_legend(override.aes = list(alpha = 1, size = 3)))

  plots[[method]] <- p
}

# Create combined plot
if (length(plots) == 0) {
  cat("Error: No valid methods found to plot\n")
  quit(status = 1)
}

cat("Creating combined plot with", length(plots), "methods\n")

# Arrange plots in a grid: two columns for up to four panels, three beyond.
# arrangeGrob() builds the layout without drawing it, so running under Rscript
# does not open a default device (and leave an Rplots.pdf behind) before
# ggsave() writes the real file.
if (length(plots) == 1) {
  combined_plot <- plots[[1]]
} else {
  combined_plot <- arrangeGrob(grobs = plots,
                               ncol = if (length(plots) <= 4) 2 else 3)
}

# Save plot
cat("Saving plot to:", opt$output, "\n")
ggsave(opt$output, combined_plot,
       width = opt$width, height = opt$height, dpi = 300)
cat("Plot saved successfully!\n")

# Print summary statistics
cat("\n=== Summary Statistics ===\n")
for (method in methods) {
  x_col <- paste0(method, "_0")
  y_col <- paste0(method, "_1")

  if (x_col %in% colnames(df) && y_col %in% colnames(df)) {
    cat(sprintf("%s - Component 1: mean=%.3f, sd=%.3f\n",
                toupper(method),
                mean(df[[x_col]], na.rm = TRUE), sd(df[[x_col]], na.rm = TRUE)))
    cat(sprintf("%s - Component 2: mean=%.3f, sd=%.3f\n",
                toupper(method),
                mean(df[[y_col]], na.rm = TRUE), sd(df[[y_col]], na.rm = TRUE)))
  }
}

# If clustering results are available, show cluster summary
if ("kmeans_cluster" %in% colnames(df)) {
  cat("\n=== K-means Clustering Summary ===\n")
  cluster_counts <- table(df$kmeans_cluster)
  for (i in names(cluster_counts)) {
    n_points <- as.integer(cluster_counts[[i]])
    cat(sprintf("Cluster %s: %d points (%.1f%%)\n",
                i, n_points, 100 * n_points / nrow(df)))
  }
}

cat("\nDone!\n")
+ +### Features + +- **UMAP**: Original implementation for non-linear dimensionality reduction +- **Multiple techniques**: PCA, t-SNE, Truncated SVD, and ICA for different use cases +- **Scalable**: Handles millions of geospatial features efficiently +- **Clustering**: Built-in K-means clustering for meaningful data grouping +- **Visualization**: Automated plotting tools for exploring results +- **R integration**: Compatible with existing R plotting scripts + +### New Dimensionality Reduction Tools + +Beyond the original UMAP implementation, this repository now includes additional dimensionality reduction techniques optimized for geospatial data: + +```bash +# Apply multiple methods to your data +python3 dimensionality_reduction.py \ + --input output/raw.tsv.gz \ + --output output/results.tsv.gz \ + --columns lat lon Speed \ + --methods pca svd tsne \ + --standardize + +# Create visualizations +python3 plot_dim_reduction.py \ + --input output/results.tsv.gz \ + --output plots.png \ + --methods pca svd \ + --color_by Activity +``` + +For detailed documentation, see [DIMENSIONALITY_REDUCTION.md](DIMENSIONALITY_REDUCTION.md). + +### Quick Start + +1. **Run the example**: `python3 example_usage.py` +2. **Process your data**: Use `main.py` with `--outputRaw` to generate input files +3. **Apply techniques**: Use `dimensionality_reduction.py` with your preferred methods +4. 
**Visualize results**: Use `plot_dim_reduction.py` or the existing R scripts + ![](docs/index_files/figure-html/unnamed-chunk-1-1.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-2.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-3.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-4.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-5.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-6.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-7.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-8.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-9.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-10.jpeg)![](docs/index_files/figure-html/unnamed-chunk-1-11.jpeg) diff --git a/dimensionality_reduction.py b/dimensionality_reduction.py new file mode 100755 index 0000000..878ec2b --- /dev/null +++ b/dimensionality_reduction.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +Additional dimensionality reduction techniques for geospatial data. +Processes outputRaw files from main.py and applies various dimensionality reduction methods. + +Supports: PCA, t-SNE, Truncated SVD, and other techniques for clustering millions of geojson features. 
import argparse
import inspect
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Suppress some common warnings for cleaner output.
# NOTE(review): the UserWarning filter also hides scikit-learn's
# ConvergenceWarning (a UserWarning subclass), e.g. when FastICA does not
# converge. Consider narrowing it.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Methods that cannot produce more components than there are input columns.
_LINEAR_METHODS = frozenset({'pca', 'svd', 'ica'})


def standardize_columns(df, columns):
    """Add standardized (mean 0, unit variance) copies of *columns* to *df*.

    Every column ``c`` present in ``df`` gets a sibling column
    ``c_standardized``. Columns that are missing are reported on stdout and
    skipped.

    Args:
        df: DataFrame to extend. It is modified in place.
        columns: Names of the columns to standardize.

    Returns:
        ``(df, new_columns)`` where ``new_columns`` lists the standardized
        column names in the order of ``columns``.
    """
    new_columns = []

    for column in columns:
        if column not in df.columns:
            print(f"Warning: Column '{column}' not found in data")
            continue
        new_column = column + '_standardized'
        # One scaler per column: each column is scaled independently anyway.
        df[new_column] = StandardScaler().fit_transform(df[[column]]).ravel()
        new_columns.append(new_column)

    return df, new_columns


def _select_features(df, columns, standardize, label):
    """Return ``(df, feature_columns)`` for one method, standardizing on request."""
    operating_columns = list(columns)
    if standardize:
        print(f"Standardizing columns for {label}: {operating_columns}")
        df, operating_columns = standardize_columns(df, operating_columns)
    return df, operating_columns


def _store_embedding(df, prefix, embedding):
    """Write each embedding dimension to ``<prefix>_<i>`` and return *df*."""
    for i in range(embedding.shape[1]):
        df[f'{prefix}_{i}'] = embedding[:, i]
    return df


def _report_explained_variance(label, model):
    """Print the per-component and total explained variance ratio of *model*."""
    explained_var = model.explained_variance_ratio_
    print(f"{label} explained variance ratio per component: {explained_var}")
    print(f"Total explained variance: {sum(explained_var):.3f}")


def _tsne_iteration_kwarg():
    """Return the keyword the installed TSNE uses for its iteration limit.

    scikit-learn 1.5 renamed ``n_iter`` to ``max_iter``. Older releases only
    accept ``n_iter``.
    """
    parameters = inspect.signature(TSNE.__init__).parameters
    return 'max_iter' if 'max_iter' in parameters else 'n_iter'


def run_pca(df, columns, components, standardize=True):
    """Apply Principal Component Analysis (PCA).

    Fast linear technique, good for initial exploration. Adds the columns
    ``pca_0 .. pca_<components-1>`` to *df* (plus ``*_standardized`` columns
    when *standardize* is true), prints the explained variance and returns
    *df*.
    """
    df, operating_columns = _select_features(df, columns, standardize, "PCA")
    print(f"Running PCA on columns: {operating_columns} with {components} components")

    pca = PCA(n_components=components, random_state=42)
    df = _store_embedding(df, 'pca', pca.fit_transform(df[operating_columns]))
    _report_explained_variance("PCA", pca)

    return df


def run_tsne(df, columns, components, standardize=True, perplexity=30,
             learning_rate=200, max_iter=1000):
    """Apply t-Distributed Stochastic Neighbor Embedding (t-SNE).

    Non-linear technique, excellent for visualization but slower. Adds the
    columns ``tsne_0 .. tsne_<components-1>`` to *df* and returns it.

    Args:
        df: Input DataFrame (modified in place).
        columns: Feature columns to embed.
        components: Number of output dimensions.
        standardize: Standardize the features first.
        perplexity: t-SNE perplexity.
        learning_rate: t-SNE learning rate.
        max_iter: Maximum number of optimization iterations.
    """
    df, operating_columns = _select_features(df, columns, standardize, "t-SNE")

    print(f"Running t-SNE on columns: {operating_columns} with {components} components")
    print(f"Parameters: perplexity={perplexity}, learning_rate={learning_rate}, max_iter={max_iter}")

    # For large datasets, we might want to sample first
    if len(df) > 10000:
        print(f"Large dataset ({len(df)} rows). Consider using --sample for faster t-SNE computation.")

    tsne_kwargs = {
        'n_components': components,
        'perplexity': perplexity,
        'learning_rate': learning_rate,
        'random_state': 42,
        'verbose': 1,
        # Passed under whichever name this scikit-learn version understands.
        _tsne_iteration_kwarg(): max_iter,
    }
    embedding = TSNE(**tsne_kwargs).fit_transform(df[operating_columns])

    return _store_embedding(df, 'tsne', embedding)


def run_truncated_svd(df, columns, components, standardize=True):
    """Apply Truncated Singular Value Decomposition (SVD).

    Very fast linear technique, excellent for large datasets. Adds the
    columns ``svd_0 .. svd_<components-1>`` to *df*, prints the explained
    variance and returns *df*.
    """
    df, operating_columns = _select_features(df, columns, standardize, "Truncated SVD")
    print(f"Running Truncated SVD on columns: {operating_columns} with {components} components")

    svd = TruncatedSVD(n_components=components, random_state=42)
    df = _store_embedding(df, 'svd', svd.fit_transform(df[operating_columns]))
    _report_explained_variance("SVD", svd)

    return df


def run_ica(df, columns, components, standardize=True):
    """Apply Independent Component Analysis (ICA).

    Good for finding independent source signals. Adds the columns
    ``ica_0 .. ica_<components-1>`` to *df* and returns it.
    """
    df, operating_columns = _select_features(df, columns, standardize, "ICA")
    print(f"Running ICA on columns: {operating_columns} with {components} components")

    ica = FastICA(n_components=components, random_state=42, max_iter=1000)
    return _store_embedding(df, 'ica', ica.fit_transform(df[operating_columns]))


def add_kmeans_clusters(df, columns, n_clusters=8, standardize=True):
    """Add a ``kmeans_cluster`` label column computed from *columns*.

    When *standardize* is true the features are scaled on the fly. Unlike the
    ``run_*`` functions, no ``*_standardized`` columns are added to *df*.

    Returns:
        *df* with the new ``kmeans_cluster`` column.
    """
    operating_columns = list(columns)
    features = df[operating_columns]

    if standardize:
        # Scale only the feature matrix instead of copying the whole frame.
        features = StandardScaler().fit_transform(features)
        operating_columns = [column + '_standardized' for column in operating_columns]

    print(f"Running K-means clustering with {n_clusters} clusters on columns: {operating_columns}")

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['kmeans_cluster'] = kmeans.fit_predict(features)

    return df


def load_data(input_file):
    """Load data from TSV.gz file"""
    print(f"Loading data from: {input_file}")
    df = pd.read_csv(input_file, sep='\t', compression='gzip')
    print(f"Loaded {len(df)} rows and {len(df.columns)} columns")
    print(f"Columns: {list(df.columns)}")
    return df


def _build_parser():
    """Create the command line parser for this script."""
    parser = argparse.ArgumentParser(
        description='Apply various dimensionality reduction techniques to geospatial data'
    )

    # Input/Output arguments
    parser.add_argument('--input', type=str, required=True,
                        help='Input TSV.gz file (outputRaw from main.py)')
    parser.add_argument('--output', type=str, required=True,
                        help='Output TSV.gz file with dimensionality reduction results')

    # Column selection
    parser.add_argument('--columns', nargs='+', default=['lat', 'lon', 'Speed'],
                        help='Columns to use for dimensionality reduction')

    # Dimensionality reduction methods
    parser.add_argument('--methods', nargs='+',
                        choices=['pca', 'tsne', 'svd', 'ica'],
                        default=['pca', 'svd'],
                        help='Dimensionality reduction methods to apply')

    # General parameters
    parser.add_argument('--components', type=int, default=2,
                        help='Number of components/dimensions to reduce to')
    parser.add_argument('--standardize', action='store_true',
                        help='Standardize columns before applying techniques')

    # Sampling for large datasets
    parser.add_argument('--sample', type=int, default=None,
                        help='Sample N rows for faster computation (useful for t-SNE)')

    # t-SNE specific parameters
    parser.add_argument('--tsne_perplexity', type=float, default=30,
                        help='t-SNE perplexity parameter')
    parser.add_argument('--tsne_learning_rate', type=float, default=200,
                        help='t-SNE learning rate')
    parser.add_argument('--tsne_max_iter', type=int, default=1000,
                        help='t-SNE maximum number of iterations')

    # Clustering
    parser.add_argument('--add_clusters', action='store_true',
                        help='Add K-means clustering results')
    parser.add_argument('--n_clusters', type=int, default=8,
                        help='Number of clusters for K-means')

    return parser


def main():
    """Command line entry point: load, validate, reduce, cluster and save."""
    parser = _build_parser()
    args = parser.parse_args()

    # Reject values that would only fail later, deep inside pandas/scikit-learn.
    if args.components < 1:
        parser.error("--components must be at least 1")
    if args.sample is not None and args.sample < 1:
        parser.error("--sample must be a positive integer")
    if args.n_clusters < 1:
        parser.error("--n_clusters must be at least 1")

    # Load data
    df = load_data(args.input)
    original_columns = set(df.columns)

    # Check if required columns exist
    missing_columns = [col for col in args.columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing columns in data: {missing_columns}")
        print(f"Available columns: {list(df.columns)}")
        sys.exit(1)

    non_numeric = [col for col in args.columns
                   if not pd.api.types.is_numeric_dtype(df[col])]
    if non_numeric:
        print(f"Error: Columns must be numeric: {non_numeric}")
        sys.exit(1)

    if _LINEAR_METHODS.intersection(args.methods) and args.components > len(args.columns):
        print(f"Error: --components ({args.components}) cannot exceed the number "
              f"of input columns ({len(args.columns)}) for PCA/SVD/ICA")
        sys.exit(1)

    # None of the estimators used here accept missing values.
    incomplete = df[args.columns].isna().any(axis=1)
    if incomplete.any():
        print(f"Warning: Dropping {int(incomplete.sum())} rows with missing values in {args.columns}")
        df = df.loc[~incomplete].reset_index(drop=True)

    # Sample data if requested
    if args.sample and args.sample < len(df):
        print(f"Sampling {args.sample} rows from {len(df)} total rows")
        df = df.sample(n=args.sample, random_state=42).reset_index(drop=True)

    reducers = {
        'pca': lambda frame: run_pca(
            frame, args.columns, args.components, args.standardize),
        'tsne': lambda frame: run_tsne(
            frame, args.columns, args.components, args.standardize,
            args.tsne_perplexity, args.tsne_learning_rate, args.tsne_max_iter),
        'svd': lambda frame: run_truncated_svd(
            frame, args.columns, args.components, args.standardize),
        'ica': lambda frame: run_ica(
            frame, args.columns, args.components, args.standardize),
    }

    # Apply dimensionality reduction methods
    for method in args.methods:
        print(f"\n{'='*50}")
        print(f"Applying {method.upper()}")
        print(f"{'='*50}")
        df = reducers[method](df)

    # Add clustering if requested
    if args.add_clusters:
        print(f"\n{'='*50}")
        print("Adding K-means clustering")
        print(f"{'='*50}")
        df = add_kmeans_clusters(df, args.columns, args.n_clusters, args.standardize)

    # Save results
    print(f"\nSaving results to: {args.output}")
    df.to_csv(args.output, sep='\t', compression='gzip', index=False)
    print(f"Final dataset has {len(df)} rows and {len(df.columns)} columns")
    # Compare against the input columns rather than substring-matching method
    # names, which would also report unrelated columns that merely contain
    # e.g. "ica" and would miss kmeans_cluster / *_standardized.
    new_columns = [col for col in df.columns if col not in original_columns]
    print(f"New columns added: {new_columns}")


if __name__ == '__main__':
    main()
import os
import shlex
import subprocess
import sys
from pathlib import Path


def run_command(cmd, description):
    """Run a command, echo its output, and report whether it succeeded.

    Args:
        cmd: Either an argument list (run directly, without a shell) or a
            single string (run through the shell, as before).
        description: Human-readable label printed above the command.

    Returns:
        True when the command exits with status 0, False otherwise.
    """
    use_shell = isinstance(cmd, str)
    shown = cmd if use_shell else shlex.join(cmd)

    print(f"\n{'='*60}")
    print(f"RUNNING: {description}")
    print(f"COMMAND: {shown}")
    print('='*60)

    result = subprocess.run(cmd, shell=use_shell, capture_output=True, text=True)

    if result.stdout:
        print("STDOUT:")
        print(result.stdout)

    if result.stderr:
        print("STDERR:")
        print(result.stderr)

    if result.returncode != 0:
        print(f"Command failed with return code: {result.returncode}")
        return False

    return True


def _python_cmd(arguments):
    """Build an argv list that runs *arguments* with the current interpreter."""
    return [sys.executable, *shlex.split(arguments)]


def _banner(title):
    """Print a section header for one step of the example."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


def main():
    """Run the whole example pipeline.

    Returns:
        0 when every step succeeded, 1 otherwise.
    """
    # Ensure we're in the right directory: the scripts and the output/ folder
    # are addressed relative to the location of this file, wherever the
    # repository is checked out.
    os.chdir(Path(__file__).resolve().parent)

    # Create output directory
    Path('output').mkdir(exist_ok=True)

    print("CATMAP DIMENSIONALITY REDUCTION EXAMPLE")
    print("This example demonstrates various dimensionality reduction techniques")
    print("that can be applied to geospatial tracking data.")

    failed = []

    def step(arguments, description):
        """Run one script invocation and remember it if it fails."""
        if not run_command(_python_cmd(arguments), description):
            failed.append(description)

    # Step 1: Generate test data (simulating outputRaw from main.py)
    _banner("STEP 1: Generating test geospatial data")
    step("generate_test_data.py", "Generate sample geospatial data")
    if failed:
        # Every later step reads output/test_raw.tsv.gz, so stop here.
        print("\nCould not generate the test data; aborting the example.")
        return 1

    # Step 2: Apply PCA and SVD (fast linear methods)
    _banner("STEP 2: Applying fast linear dimensionality reduction (PCA + SVD)")
    step("dimensionality_reduction.py "
         "--input output/test_raw.tsv.gz "
         "--output output/linear_methods.tsv.gz "
         "--columns lat lon Speed Elevation "
         "--methods pca svd "
         "--standardize "
         "--add_clusters --n_clusters 6",
         "Apply PCA and SVD with clustering")

    # Step 3: Apply ICA (for independent components)
    _banner("STEP 3: Applying Independent Component Analysis (ICA)")
    step("dimensionality_reduction.py "
         "--input output/test_raw.tsv.gz "
         "--output output/ica_method.tsv.gz "
         "--columns lat lon Speed "
         "--methods ica "
         "--standardize",
         "Apply ICA for independent components")

    # Step 4: Apply t-SNE (non-linear, good for visualization)
    _banner("STEP 4: Applying t-SNE (non-linear visualization)")
    step("dimensionality_reduction.py "
         "--input output/test_raw.tsv.gz "
         "--output output/tsne_method.tsv.gz "
         "--columns lat lon Speed "
         "--methods tsne "
         "--standardize "
         "--sample 2000 "  # Sample for faster t-SNE
         "--tsne_perplexity 50 "
         "--tsne_learning_rate 200",
         "Apply t-SNE with custom parameters")

    # Step 5: Create comprehensive analysis with all methods
    _banner("STEP 5: Comprehensive analysis with multiple methods")
    step("dimensionality_reduction.py "
         "--input output/test_raw.tsv.gz "
         "--output output/comprehensive_analysis.tsv.gz "
         "--columns lat lon Speed Accuracy "
         "--methods pca svd ica "
         "--components 3 "  # 3D reduction
         "--standardize "
         "--add_clusters --n_clusters 8",
         "Comprehensive analysis with 3D reduction")

    # Step 6: Generate visualizations
    _banner("STEP 6: Creating visualizations")

    # Plot linear methods
    step("plot_dim_reduction.py "
         "--input output/linear_methods.tsv.gz "
         "--output output/linear_methods_plot.png "
         "--methods pca svd "
         "--color_by Activity",
         "Plot PCA and SVD results")

    # Plot comprehensive analysis
    step("plot_dim_reduction.py "
         "--input output/comprehensive_analysis.tsv.gz "
         "--output output/comprehensive_plot.png "
         "--methods pca svd ica "
         "--color_by Name",
         "Plot comprehensive analysis results")

    # Plot t-SNE results
    step("plot_dim_reduction.py "
         "--input output/tsne_method.tsv.gz "
         "--output output/tsne_plot.png "
         "--methods tsne "
         "--color_by Activity",
         "Plot t-SNE results")

    # Summary: only claim success when every step actually succeeded.
    if failed:
        _banner("EXAMPLE FINISHED WITH ERRORS")
        print("\nFailed steps:")
        for description in failed:
            print(f"- {description}")
        return 1

    _banner("EXAMPLE COMPLETED!")
    print("\nGenerated files:")
    print("- output/test_raw.tsv.gz (sample raw data)")
    print("- output/linear_methods.tsv.gz (PCA + SVD results)")
    print("- output/ica_method.tsv.gz (ICA results)")
    print("- output/tsne_method.tsv.gz (t-SNE results)")
    print("- output/comprehensive_analysis.tsv.gz (multiple methods)")
    print("\nGenerated plots:")
    print("- output/linear_methods_plot.png")
    print("- output/comprehensive_plot.png")
    print("- output/tsne_plot.png")

    print("\nNext steps:")
    print("1. Examine the generated plots to compare different methods")
    print("2. Use the clustering results for further analysis")
    print("3. Apply these techniques to your actual geospatial data")
    print("4. Experiment with different column combinations and parameters")
    return 0


if __name__ == '__main__':
    sys.exit(main())
import argparse
import math
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def load_data(input_file):
    """Load the dimensionality reduction results"""
    print(f"Loading data from: {input_file}")
    df = pd.read_csv(input_file, sep='\t', compression='gzip')
    print(f"Loaded {len(df)} rows and {len(df.columns)} columns")
    return df


def _grid_shape(n_panels):
    """Return ``(rows, cols)`` for *n_panels* subplots.

    One or two panels share a single row, three or four use a 2x2 grid, and
    anything larger is laid out three per row.
    """
    if n_panels <= 2:
        return 1, n_panels
    if n_panels <= 4:
        return 2, 2
    return math.ceil(n_panels / 3), 3


def create_plots(df, methods, color_by='Activity', output_file='plots.png', figsize=(16, 12)):
    """Create scatter plots for each dimensionality reduction method.

    A method is plotted when both ``<method>_0`` and ``<method>_1`` exist in
    *df*. Other methods are reported and skipped.

    Args:
        df: DataFrame holding the embedding columns.
        methods: Method prefixes to plot (e.g. ``['pca', 'svd']``).
        color_by: Column whose values define the point colours. When the
            column is missing, all points are drawn in blue.
        output_file: Path of the image to write.
        figsize: Figure size as ``(width, height)``.

    Returns:
        The matplotlib figure, or ``None`` when no requested method is
        available in *df* (nothing is written in that case).
    """
    # Filter available methods
    available_methods = []
    for method in methods:
        if f'{method}_0' in df.columns and f'{method}_1' in df.columns:
            available_methods.append(method)
        else:
            print(f"Warning: Method '{method}' not found in data")

    if not available_methods:
        print("Error: No valid methods found in data")
        return None

    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("Set1")

    # Create subplots. squeeze=False always yields a 2D array, so a single
    # panel needs no special case.
    n_rows, n_cols = _grid_shape(len(available_methods))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # Color mapping: one colour per category, shared by all panels. Built per
    # category rather than per row, so it stays cheap on large frames.
    has_color_column = color_by in df.columns
    if has_color_column:
        categories = df[color_by].unique()
        color_map = dict(zip(categories, sns.color_palette("Set1", len(categories))))
    else:
        print(f"Warning: Color column '{color_by}' not found, using default colors")

    # Create plots for each method
    for ax, method in zip(axes, available_methods):
        x_col = f'{method}_0'
        y_col = f'{method}_1'

        if has_color_column:
            for category in categories:
                # NaN never compares equal to itself, so select missing
                # values explicitly instead of silently dropping those rows.
                if pd.isna(category):
                    mask = df[color_by].isna()
                else:
                    mask = df[color_by] == category
                ax.scatter(df.loc[mask, x_col], df.loc[mask, y_col],
                           label=category, alpha=0.6, s=20,
                           color=color_map[category])
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        else:
            ax.scatter(df[x_col], df[y_col], alpha=0.6, s=20, color='blue')

        ax.set_xlabel(f'{method.upper()} Component 1')
        ax.set_ylabel(f'{method.upper()} Component 2')
        ax.set_title(f'{method.upper()} Dimensionality Reduction')
        ax.grid(True, alpha=0.3)

    # Hide unused subplots
    for ax in axes[len(available_methods):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"Plot saved to: {output_file}")

    return fig


def print_summary(df, methods):
    """Print summary statistics for the dimensionality reduction results.

    For each method present in *df*, the mean and standard deviation of its
    first two components are printed. When a ``kmeans_cluster`` column
    exists, the size and share of every cluster follows.
    """
    print("\n" + "="*50)
    print("SUMMARY STATISTICS")
    print("="*50)

    for method in methods:
        x_col = f'{method}_0'
        y_col = f'{method}_1'

        if x_col in df.columns and y_col in df.columns:
            print(f"\n{method.upper()}:")
            print(f"  Component 1: mean={df[x_col].mean():.3f}, std={df[x_col].std():.3f}")
            print(f"  Component 2: mean={df[y_col].mean():.3f}, std={df[y_col].std():.3f}")

    # Clustering summary if available
    if 'kmeans_cluster' in df.columns:
        print(f"\nK-MEANS CLUSTERING:")
        cluster_counts = df['kmeans_cluster'].value_counts().sort_index()
        for cluster, count in cluster_counts.items():
            pct = 100 * count / len(df)
            print(f"  Cluster {cluster}: {count} points ({pct:.1f}%)")


def main():
    """Command line entry point: load results, plot them and print a summary."""
    parser = argparse.ArgumentParser(
        description='Plot dimensionality reduction results'
    )

    parser.add_argument('--input', type=str, required=True,
                        help='Input TSV.gz file with dimensionality reduction results')
    parser.add_argument('--output', type=str, default='dim_reduction_plots.png',
                        help='Output plot file')
    parser.add_argument('--methods', nargs='+',
                        choices=['pca', 'tsne', 'svd', 'ica'],
                        default=['pca', 'svd', 'ica'],
                        help='Methods to plot')
    parser.add_argument('--color_by', type=str, default='Activity',
                        help='Column to use for coloring points')
    parser.add_argument('--figsize', nargs=2, type=float, default=[16, 12],
                        help='Figure size (width height)')

    args = parser.parse_args()

    # Load data
    df = load_data(args.input)

    # Create plots
    fig = create_plots(df, args.methods, args.color_by, args.output, tuple(args.figsize))
    if fig is None:
        # Nothing was written, so do not report a saved plot.
        sys.exit(1)
    plt.close(fig)

    # Print summary
    print_summary(df, args.methods)

    print(f"\nDone! Plot saved to {args.output}")


if __name__ == '__main__':
    main()