modelscope · cyruszhang · Jul 16, 2025 · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/configs/demo/checkpoint_config_example.yaml b/configs/demo/checkpoint_config_example.yaml
@@ -0,0 +1,172 @@
+# =============================================================================
+# COMPREHENSIVE DATAJUICER DEMO: Checkpointing, Event Logging & Job Management
+# =============================================================================
+# This demo showcases:
+# 1. Configurable checkpointing strategies
+# 2. Event logging with job-specific directories
+# 3. Flexible storage architecture
+# 4. Job resumption capabilities
+# 5. Real DataJuicer operations
+# =============================================================================
+
+# ---- Global settings ----
+# Working directory; job-specific directories are created beneath it
+# (see the directory layout notes at the end of this file).
+work_dir: "./outputs/demo-checkpoint-strategies"
+
+# ---- Optional: separate storage locations ----
+# Event logs are small files written frequently, so they benefit from fast
+# storage (SSD, local disk). Optional.
+event_log_dir: "/tmp/fast_event_logs"
+
+# Checkpoints are large files written infrequently, so they suit large
+# storage (HDD, network storage). Optional.
+checkpoint_dir: "/tmp/large_checkpoints"
+
+# ---- Executor ----
+# Selects the enhanced partitioned executor.
+executor_type: "ray_partitioned"
+
+
+# Intermediate storage for partition and checkpoint data:
+# file format, compression, and lifecycle management.
+intermediate_storage:
+  # --- File format and compression ---
+  # Options: parquet, arrow, jsonl
+  format: "parquet"
+  # Options: snappy, gzip, none
+  compression: "snappy"
+  use_arrow_batches: true
+  arrow_batch_size: 500
+  arrow_memory_mapping: false
+
+  # --- File lifecycle management ---
+  # Keep temporary files so they are available for debugging/resumption
+  preserve_intermediate_data: true
+  # NOTE(review): enabled together with preserve_intermediate_data above;
+  # confirm how these two settings interact.
+  cleanup_temp_files: true
+  cleanup_on_success: false
+  # Options: keep_all, keep_failed_only, cleanup_all
+  retention_policy: "keep_all"
+  max_retention_days: 7
+
+
+# Partitioning settings
+partition:
+  # Auto-configuration is recommended for most use cases; it is turned off
+  # here so that the manual settings below are used instead.
+  auto_configure: false
+
+  # Manual partitioning settings (used when auto_configure: false)
+  # Recommended partition sizes:
+  # - 50-100: debugging, quick iterations, small datasets
+  # - 100-300: production; good balance of fault tolerance and efficiency
+  # - 300-500: large datasets with stable processing
+  # - 500+: only for very large datasets with minimal failure risk
+  # NOTE(review): these ranges do not match the scale of `size` below
+  # (50000 samples per partition) - confirm which unit they refer to.
+  #
+  # Number of samples per partition (smaller for better fault tolerance)
+  size: 50000
+  # Maximum partition size in MB (reduced for faster processing)
+  max_size_mb: 128
+
+  # Fault tolerance
+  enable_fault_tolerance: true
+  max_retries: 3
+  # Options: exponential, linear, fixed
+  retry_backoff: "exponential"
+
+# Checkpointing settings
+checkpoint:
+  enabled: true
+  # Options: every_op, every_partition, every_n_ops, manual, disabled
+  strategy: "every_op"
+  # Used by the every_n_ops strategy
+  n_ops: 2
+  # Used by the manual strategy
+  op_names:
+    - "clean_links_mapper"
+    - "whitespace_normalization_mapper"
+
+# Event logging settings
+event_logging:
+  enabled: true
+  backup_count: 5
+  max_log_size_mb: 100
+
+# Ray settings
+ray_address: "auto"
+# Number of Ray workers
+np: 2
+
+# Input dataset and export location
+# (double-quoted to match the quoting used by the rest of this file)
+dataset_path: "./demos/data/demo-dataset.jsonl"
+export_path: "./outputs/demo-checkpoint-strategies/processed.jsonl"
+
+# Process pipeline with real DataJuicer operations.
+# Operators run in the order listed; each entry maps an operator name to its
+# arguments.
+process:
+  # --- Text cleaning (mappers) ---
+  # NOTE: clean_links_mapper and clean_email_mapper are configured through
+  # `pattern` / `repl`; they have no min/max count options, so none are set
+  # here and the operator defaults apply.
+  - clean_links_mapper:
+      text_key: "text"
+
+  - clean_email_mapper:
+      text_key: "text"
+
+  - whitespace_normalization_mapper:
+      text_key: "text"
+
+  - fix_unicode_mapper:
+      text_key: "text"
+
+  # --- Text filtering ---
+  - text_length_filter:
+      text_key: "text"
+      min_len: 10
+      max_len: 10000
+
+  - alphanumeric_filter:
+      text_key: "text"
+      min_ratio: 0.3
+
+  # --- Quality filtering ---
+  - character_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.3
+
+  - word_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.3
+
+# Export settings
+export_in_parallel: true
+keep_hashes_in_res_ds: true
+keep_stats_in_res_ds: true
+
+# =============================================================================
+# COMPLETE USER EXPERIENCE:
+# =============================================================================
+# 1. Start job:
+#    dj-process --config configs/demo/checkpoint_config_example.yaml
+#    # Output shows: Job ID (timestamp_configname_suffix), job directory, resumption command
+#    # Example: 20241201_143022_checkpoint_config_example_abc123
+#
+# 2. If job fails, resume with:
+#    dj-process --config configs/demo/checkpoint_config_example.yaml --job_id <job_id>
+#    # System validates job_id and shows previous status
+#
+# 3. Directory structure (flexible storage):
+#    Option A: All in work_dir (default)
+#    {work_dir}/
+#    ├── 20241201_143022_checkpoint_config_example_abc123/  # Job-specific directory
+#    │   ├── job_summary.json         # Job metadata and status
+#    │   ├── event_logs/
+#    │   │   ├── events.log           # Human-readable logs
+#    │   │   └── events.jsonl         # Machine-readable for resumption
+#    │   ├── checkpoints/             # Job-specific checkpoint data
+#    │   │   ├── partition_000000/
+#    │   │   │   ├── op_000_clean_links_mapper.parquet
+#    │   │   │   └── op_001_clean_email_mapper.parquet
+#    │   │   └── checkpoint_1701432000.json
+#    │   └── metadata/                # Job-specific metadata
+#    │       └── dataset_mapping.json
+#
+#    Option B: Separate storage (configured)
+#    {work_dir}/
+#    ├── 20241201_143022_checkpoint_config_example_abc123/  # Job metadata only
+#    │   └── job_summary.json
+#    /tmp/fast_event_logs/            # Fast storage for event logs
+#    ├── 20241201_143022_checkpoint_config_example_abc123/
+#    │   └── event_logs/
+#    │       ├── events.log
+#    │       └── events.jsonl
+#    /tmp/large_checkpoints/          # Large storage for checkpoints
+#    ├── 20241201_143022_checkpoint_config_example_abc123/
+#    │   ├── partition_000000/
+#    │   │   ├── op_000_clean_links_mapper.parquet
+#    │   │   └── op_001_clean_email_mapper.parquet
+#    │   └── checkpoint_1701432000.json
+#    └── results/                     # Shared final results
+# =============================================================================