From 6aa8a995b935340fd4be5e680b52587c3f448acc Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Fri, 24 Oct 2025 22:29:20 +0200 Subject: [PATCH 01/13] Change settings location --- sidebars.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sidebars.ts b/sidebars.ts index 2dab6ad4..d0025c18 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -44,6 +44,8 @@ const sidebars: SidebarsConfig = { "organization/users", "organization/roles", "organization/audit", + "organization/tenants", + "organization/usage-and-limits", { type: "category", label: "Settings", @@ -54,8 +56,6 @@ const sidebars: SidebarsConfig = { "organization/settings/subscription", ] }, - "organization/tenants", - "organization/usage-and-limits", ] }, { From 72c91ec9e0a5191a26c23acdb5d3d585503305e7 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Fri, 24 Oct 2025 23:21:10 +0200 Subject: [PATCH 02/13] Doc updates --- docs/configuration/scheduling/interval.mdx | 491 ++++++++++++++++++ docs/configuration/scheduling/overview.mdx | 518 +++++++++++++++++++ docs/configuration/scheduling/schedule.mdx | 440 ++++++++++++++++ docs/configuration/targets/elasticsearch.mdx | 208 +++++++- docs/configuration/targets/splunk-hec.mdx | 126 ++++- 5 files changed, 1760 insertions(+), 23 deletions(-) create mode 100644 docs/configuration/scheduling/interval.mdx create mode 100644 docs/configuration/scheduling/overview.mdx create mode 100644 docs/configuration/scheduling/schedule.mdx diff --git a/docs/configuration/scheduling/interval.mdx b/docs/configuration/scheduling/interval.mdx new file mode 100644 index 00000000..91d43963 --- /dev/null +++ b/docs/configuration/scheduling/interval.mdx @@ -0,0 +1,491 @@ +# Interval + +## Synopsis + +Interval controls how frequently a target or route executes, measured in time units or seconds. This provides simple, periodic execution without the complexity of cron expressions. + +## Schema + +### Targets + +```yaml +targets: + - name: + type: + properties: + interval: + # target-specific properties +``` + +### Routes + +```yaml +routes: + - name: + interval: + source: + destination: +``` + +## Configuration + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency using time units (5m, 1h) or seconds (300)| + +## Details + +The interval field determines how often a target or route processes queued data. When configured, the component will execute at regular intervals rather than continuously. This helps: + +- Control resource consumption +- Batch data for more efficient processing +- Reduce API call frequency to external services +- Manage costs for volume-based services +- Create time-based routing logic + +Between interval executions, data accumulates in the queue and is processed during the next execution window. + +### Format Options + +Interval accepts two formats: + +**Time Units (Recommended)** +- Append a unit suffix to the number +- More readable and self-documenting +- Units: `s` (seconds), `m` (minutes), `h` (hours), `d` (days) + +**Seconds (Numeric)** +- Plain integer representing seconds +- Useful for programmatic configuration +- Example: `300` equals 5 minutes + +### Special Values + +- Omit the field or set to `0` or `1` for realtime (continuous) execution +- Values less than 1 second are treated as realtime + +### Execution Behavior + +The first execution occurs immediately after component initialization. Subsequent executions occur at the specified interval after the previous execution completes. 
This means the actual time between executions includes both the interval delay and the processing time. + +## Target Examples + +### Every 5 Minutes + + + + Execute target every 5 minutes... + + + ```yaml + targets: + - name: frequent_splunk + type: splunk + properties: + interval: "5m" + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "logs" + ``` + + + +### Every Hour + + + + Execute target every hour using time unit format... + + + ```yaml + targets: + - name: hourly_elastic + type: elastic + properties: + interval: "1h" + index: "hourly-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Every 30 Seconds + + + + Execute target every 30 seconds for near-realtime processing... + + + ```yaml + targets: + - name: rapid_kafka + type: kafka + properties: + interval: "30s" + brokers: + - "kafka1:9092" + - "kafka2:9092" + topic: "logs" + ``` + + + +### Every 15 Minutes (Numeric) + + + + Execute every 15 minutes using numeric seconds format... + + + ```yaml + targets: + - name: quarter_hourly + type: splunk + properties: + interval: 900 + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "logs" + ``` + + + +### Every 6 Hours + + + + Execute target every 6 hours for batch processing... + + + ```yaml + targets: + - name: batch_s3 + type: awss3 + properties: + interval: "6h" + bucket: "log-archives" + region: "us-east-1" + ``` + + + +### Every 2 Hours + + + + Execute every 2 hours with large batches... + + + ```yaml + targets: + - name: batch_clickhouse + type: clickhouse + properties: + interval: "2h" + connection_string: "clickhouse://localhost:9000" + table: "logs" + batch_size: 50000 + ``` + + + +### Every Day + + + + Execute once per day for daily aggregations... + + + ```yaml + targets: + - name: daily_bigquery + type: bigquery + properties: + interval: "24h" + project: "my-project" + dataset: "daily_data" + tables: + - "logs" + ``` + + + +### Every 10 Minutes + + + + Execute every 10 minutes for moderate batching... + + + ```yaml + targets: + - name: moderate_elastic + type: elastic + properties: + interval: "10m" + index: "batched-logs" + batch_size: 20000 + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Every 3 Hours + + + + Execute every 3 hours for cost optimization... + + + ```yaml + targets: + - name: cost_optimized + type: sentinel + properties: + interval: "3h" + workspace_id: "YOUR-WORKSPACE-ID" + shared_key: "YOUR-SHARED-KEY" + log_type: "CustomLogs" + ``` + + + +### Realtime Processing + + + + Omit interval for continuous, realtime processing... + + + ```yaml + targets: + - name: realtime_splunk + type: splunk + properties: + # No interval specified - processes continuously + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "realtime-logs" + ``` + + + +## Route Examples + +### Periodic Archive Route + + + + Route data to archive storage every hour... + + + ```yaml + routes: + - name: hourly_archive + interval: "1h" + source: "processed_logs" + destination: "s3_archive" + + - name: realtime_route + source: "processed_logs" + destination: "elasticsearch" + ``` + + + +### Batch Processing Route + + + + Route data to batch target every 15 minutes... 
+ + + ```yaml + routes: + - name: batch_route + interval: "15m" + source: "application_logs" + destination: "batch_processor" + + - name: stream_route + source: "application_logs" + destination: "realtime_processor" + ``` + + + +### Cost Optimized Route + + + + Route data to expensive target every 6 hours to reduce costs... + + + ```yaml + routes: + - name: cost_optimized_route + interval: "6h" + source: "high_volume_logs" + destination: "expensive_analytics" + + - name: cheap_storage_route + source: "high_volume_logs" + destination: "s3_storage" + ``` + + + +### Aggregation Route + + + + Route aggregated metrics every 5 minutes while streaming raw data... + + + ```yaml + routes: + - name: aggregated_route + interval: "5m" + source: "metrics" + destination: "aggregated_storage" + + - name: raw_route + source: "metrics" + destination: "raw_storage" + ``` + + + +### Periodic Backup Route + + + + Route data to backup every 30 minutes... + + + ```yaml + routes: + - name: backup_route + interval: "30m" + source: "critical_logs" + destination: "backup_storage" + + - name: primary_route + source: "critical_logs" + destination: "primary_storage" + ``` + + + +## Common Intervals + +| Interval | Time Unit | Seconds | Use Case | +|----------|-----------|---------|----------| +| 30 seconds | `30s` | `30` | Near-realtime, low latency | +| 1 minute | `1m` | `60` | Frequent updates | +| 5 minutes | `5m` | `300` | Balanced batching | +| 10 minutes | `10m` | `600` | Moderate batching | +| 15 minutes | `15m` | `900` | Quarter-hourly processing | +| 30 minutes | `30m` | `1800` | Half-hourly processing | +| 1 hour | `1h` | `3600` | Hourly aggregation | +| 2 hours | `2h` | `7200` | Light batch processing | +| 6 hours | `6h` | `21600` | Heavy batch processing | +| 12 hours | `12h` | `43200` | Twice daily | +| 24 hours | `24h` | `86400` | Daily processing | + +## Comparison with Schedule + +| Feature | Interval | Schedule | +|---------|----------|----------| +| Format | Time units or seconds | Cron expression | +| Complexity | Simple | Complex | +| First execution | Immediate | Next matching time | +| Execution pattern | Fixed intervals | Specific times | +| Best for | Regular frequency | Time-specific events | + +Use `interval` when: +- You want simple, regular frequency +- Exact execution time doesn't matter +- You need immediate first execution +- Configuration should be simple and readable + +Use `schedule` when: +- You need specific execution times +- Business schedules matter (business hours, weekends) +- You want to avoid certain time windows +- Complex patterns are required + +## Performance Considerations + +### Short Intervals (< 1 minute) + +- Near-realtime processing +- Higher resource consumption +- More frequent API calls +- Better for latency-sensitive scenarios + +### Medium Intervals (1-30 minutes) + +- Balanced approach +- Reasonable batching +- Moderate resource usage +- Good for most use cases + +### Long Intervals (> 1 hour) + +- Large batch processing +- Lower resource consumption +- Reduced API call frequency +- Better for cost optimization + +### Batch Size Interaction + +Combine interval with batch size for optimal performance: + +```yaml +properties: + interval: "5m" + batch_size: 10000 +``` + +This ensures batches don't exceed 10,000 events even if more accumulate during the 5-minute interval. 
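+
+A fuller sketch in the context of a complete target definition (the target name, endpoint, and index below are illustrative placeholders):
+
+```yaml
+targets:
+  - name: interval_batched_elastic
+    type: elastic
+    properties:
+      interval: "5m"        # flush accumulated events every 5 minutes
+      batch_size: 10000     # cap each batch at 10,000 events
+      index: "batched-logs"
+      endpoints:
+        - endpoint: "http://elasticsearch:9200"
+```
+
+Pairing a short interval with a bounded batch size keeps memory usage predictable while still amortizing request overhead.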
+ +## Usage Notes + +- Interval measures time between executions, not including processing time +- If processing takes longer than the interval, the next execution waits for completion +- Schedule takes precedence if both interval and schedule are configured +- Use interval for simple periodic execution, schedule for complex time-based patterns +- Omitting interval defaults to realtime (continuous) processing +- Values less than 1 second are treated as realtime +- Multiple routes can have different intervals, creating time-based routing logic + +## Troubleshooting + +### Component Executing Too Frequently + +If the component runs more often than expected: +- Verify interval value and unit +- Check for multiple component configurations +- Ensure processing completes quickly to maintain interval + +### Component Not Executing at Expected Frequency + +If executions are delayed: +- Processing time may exceed interval duration +- Check system resources and component performance +- Consider increasing batch size to reduce execution frequency +- Review logs for errors or performance issues + +### Resource Consumption Issues + +If resources are constrained: +- Increase interval to reduce execution frequency +- Optimize batch size and component configuration +- Monitor queue depth between executions +- Consider moving to schedule for off-peak processing \ No newline at end of file diff --git a/docs/configuration/scheduling/overview.mdx b/docs/configuration/scheduling/overview.mdx new file mode 100644 index 00000000..5b75e960 --- /dev/null +++ b/docs/configuration/scheduling/overview.mdx @@ -0,0 +1,518 @@ +# Scheduling and Time-Based Execution + +## Overview + +Scheduling capabilities provide powerful control over when and how often your telemetry pipeline components execute. By adding temporal logic to targets and routes, you can optimize resource usage, reduce costs, implement time-based routing strategies, and align data processing with business requirements. + +## What You Can Do + +### Control Execution Timing + +Execute components at specific times or regular intervals: + +- **Targets**: Control when data is sent to destinations +- **Routes**: Control when data flows through specific paths + +### Optimize Resource Usage + +Reduce system load by processing data during specific time windows: + +- Run expensive operations during off-peak hours +- Batch data processing to reduce API calls +- Schedule resource-intensive targets for low-traffic periods +- Implement graduated data retention strategies + +### Implement Business Logic + +Align data processing with business schedules: + +- Process data only during business hours +- Route critical logs differently during peak times +- Archive data on specific days or times +- Create time-based data tiers (hot, warm, cold) + +### Reduce Costs + +Minimize expenses by controlling when and how often components execute: + +- Batch API calls to reduce per-request costs +- Route data to expensive services less frequently +- Use cheaper storage during off-peak hours +- Implement cost-effective retention policies + +## Scheduling Methods + +The platform provides two complementary approaches to time-based execution: + +### Schedule (Cron-Based) + +Execute components at specific times using cron expressions. 
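+
+For example, a daily 3 AM run is expressed as (the expression below is an illustrative value):
+
+```yaml
+schedule: "0 3 * * *"   # 03:00 every day, system local time
+```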
+ +**Best for:** +- Time-specific execution (daily at 3 AM, weekdays at 9 AM) +- Business hour constraints +- Calendar-based patterns (monthly reports, weekend processing) +- Complex time windows + +**Learn more:** Schedule Documentation + +### Interval (Frequency-Based) + +Execute components at regular intervals using simple time units. + +**Best for:** +- Regular periodic execution (every 5 minutes, every hour) +- Simple batching strategies +- Predictable resource consumption +- Straightforward configuration + +**Learn more:** Interval Documentation + +## Use Cases and Scenarios + +### Time-Based Data Routing + +Route data to different destinations based on time: + +```yaml +# Send to realtime analytics during business hours +routes: + - name: business_hours_analytics + schedule: "* 9-17 * * 1-5" + source: "application_logs" + destination: "realtime_splunk" + +# Archive all data once per day +routes: + - name: daily_archive + schedule: "0 0 * * *" + source: "application_logs" + destination: "s3_archive" + +# Default realtime processing +routes: + - name: default_route + source: "application_logs" + destination: "basic_storage" +``` + +**Benefits:** +- Realtime insights during business hours +- Cost-effective archival during off-hours +- Automatic failover to basic storage + +### Cost-Optimized Data Tiers + +Implement hot, warm, and cold data tiers: + +```yaml +# Hot tier - realtime expensive analytics +targets: + - name: hot_tier + type: elastic + properties: + index: "logs-hot" + endpoints: + - endpoint: "https://premium-es.example.com:9200" + +# Warm tier - hourly batch to moderate storage +targets: + - name: warm_tier + type: elastic + properties: + interval: "1h" + index: "logs-warm" + endpoints: + - endpoint: "https://standard-es.example.com:9200" + +# Cold tier - daily archive to cheap storage +targets: + - name: cold_tier + type: awss3 + schedule: "0 2 * * *" + properties: + bucket: "logs-cold" + region: "us-east-1" +``` + +**Benefits:** +- Realtime access to recent data +- Reduced costs for historical data +- Automatic data lifecycle management + +### Business Hours Processing + +Process critical data differently during business hours: + +```yaml +# High priority during business hours +routes: + - name: business_hours_critical + schedule: "* 9-17 * * 1-5" + source: "error_logs" + destination: "alert_system" + +# Batch processing after hours +routes: + - name: after_hours_batch + schedule: "0 18-8 * * *" + source: "error_logs" + destination: "batch_processor" +``` + +**Benefits:** +- Immediate alerts during work hours +- Efficient batch processing overnight +- Reduced alert fatigue after hours + +### Periodic Backup and Compliance + +Implement automated backup and compliance workflows: + +```yaml +# Hourly backup to secondary region +targets: + - name: backup_target + type: awss3 + properties: + interval: "1h" + bucket: "logs-backup" + region: "us-west-2" + +# Daily compliance archive +targets: + - name: compliance_archive + type: awss3 + schedule: "0 1 * * *" + properties: + bucket: "logs-compliance" + region: "us-east-1" + +# Weekly long-term archive +targets: + - name: longterm_archive + type: glacier + schedule: "0 3 * * 0" + properties: + vault: "logs-longterm" +``` + +**Benefits:** +- Disaster recovery protection +- Regulatory compliance +- Efficient long-term storage + +### Peak Load Management + +Handle varying data volumes throughout the day: + +```yaml +# Realtime processing during low-traffic hours +routes: + - name: night_realtime + schedule: "* 0-6,22-23 * * *" + 
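+    # every minute during hours 00-06 and 22-23, i.e. the overnight low-traffic window (system local time)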
source: "high_volume_logs" + destination: "realtime_processor" + +# Batched processing during peak hours +routes: + - name: peak_hours_batch + schedule: "*/15 7-21 * * *" + source: "high_volume_logs" + destination: "batch_processor" +``` + +**Benefits:** +- Reduced latency during low-traffic periods +- System stability during peak hours +- Efficient resource utilization + +### Multi-Region Data Distribution + +Distribute data across regions based on time zones: + +```yaml +# US region during US business hours +targets: + - name: us_target + type: splunk + schedule: "* 9-17 * * 1-5" + properties: + endpoints: + - endpoint: "https://us-splunk.example.com:8088/services/collector" + token: "US-TOKEN" + +# EU region during EU business hours +targets: + - name: eu_target + type: splunk + schedule: "* 9-17 * * 1-5" + properties: + endpoints: + - endpoint: "https://eu-splunk.example.com:8088/services/collector" + token: "EU-TOKEN" + +# Global 24/7 backup +targets: + - name: global_backup + type: awss3 + properties: + interval: "1h" + bucket: "logs-global" +``` + +**Benefits:** +- Reduced cross-region latency +- Compliance with data sovereignty +- 24/7 backup coverage + +### Graduated Retention Policy + +Implement automatic data lifecycle management: + +```yaml +# Keep everything for 7 days +targets: + - name: hot_storage + type: elastic + properties: + index: "logs-recent" + +# Archive weekly to cheaper storage +targets: + - name: warm_storage + type: clickhouse + schedule: "0 2 * * 0" + properties: + table: "logs_archive" + +# Monthly move to cold storage +targets: + - name: cold_storage + type: awss3 + schedule: "0 3 1 * *" + properties: + bucket: "logs-cold-archive" +``` + +**Benefits:** +- Fast access to recent data +- Cost-effective long-term storage +- Automated data lifecycle + +### Development vs Production + +Different schedules for different environments: + +```yaml +# Development - batch every 30 minutes +targets: + - name: dev_splunk + type: splunk + properties: + interval: "30m" + endpoints: + - endpoint: "https://dev-splunk.example.com:8088/services/collector" + token: "DEV-TOKEN" + +# Production - realtime processing +targets: + - name: prod_splunk + type: splunk + properties: + # No interval - continuous realtime + endpoints: + - endpoint: "https://prod-splunk.example.com:8088/services/collector" + token: "PROD-TOKEN" +``` + +**Benefits:** +- Reduced costs in development +- Realtime monitoring in production +- Environment-appropriate SLAs + +## Configuration Patterns + +### Combining Schedule and Interval + +Use both for different components in the same pipeline: + +```yaml +# Scheduled target for compliance +targets: + - name: compliance_target + type: awss3 + schedule: "0 0 * * *" + properties: + bucket: "compliance-logs" + +# Interval-based target for monitoring +targets: + - name: monitoring_target + type: elastic + properties: + interval: "5m" + index: "monitoring" + +# Realtime target for alerts +targets: + - name: alert_target + type: splunk + properties: + # No schedule or interval - realtime + index: "alerts" +``` + +### Conditional Time-Based Routes + +Create fallback routing with time constraints: + +```yaml +routes: + # Priority 1: Business hours to premium service + - name: priority_route + schedule: "* 9-17 * * 1-5" + source: "logs" + destination: "premium_target" + + # Priority 2: Night hours to standard service + - name: night_route + schedule: "* 0-8,18-23 * * *" + source: "logs" + destination: "standard_target" + + # Priority 3: Weekend to basic service + - name: 
weekend_route + schedule: "* * * * 0,6" + source: "logs" + destination: "basic_target" + + # Default: fallback to archive + - name: fallback_route + source: "logs" + destination: "archive_target" +``` + +## Best Practices + +### Start Simple + +Begin with interval-based execution for straightforward use cases: + +```yaml +properties: + interval: "5m" +``` + +Migrate to schedule when you need specific timing: + +```yaml +schedule: "0 */6 * * *" +``` + +### Consider Data Volume + +Match scheduling to expected data volume: + +- **High volume**: Use longer intervals or specific schedules +- **Low volume**: Realtime or short intervals work well +- **Variable volume**: Use time-based routing to handle peaks + +### Monitor Queue Depth + +Between scheduled executions, data accumulates in queues: + +- Monitor queue sizes to prevent memory issues +- Adjust intervals if queues grow too large +- Balance batch size with interval duration + +### Document Timezone Assumptions + +All schedules use system local time: + +```yaml +# Runs at midnight system local time +schedule: "0 0 * * *" +``` + +Always document expected timezone in configuration comments. + +### Test Schedule Expressions + +Validate cron expressions before deployment: + +- Use online cron validators +- Test in development environment first +- Document intended execution times + +### Plan for Failures + +Schedule-based components don't retry immediately: + +- Failed executions wait for next scheduled time +- Ensure monitoring alerts for execution failures +- Consider redundant routes for critical data + +## Performance Impact + +### Memory Considerations + +Longer intervals mean larger queues: + +- **Short intervals (< 5m)**: Smaller queues, higher CPU +- **Long intervals (> 1h)**: Larger queues, lower CPU +- Balance based on available memory and data volume + +### Network Impact + +Scheduling affects network patterns: + +- **Realtime**: Constant network usage +- **Short intervals**: Frequent bursts +- **Long intervals**: Large periodic bursts +- Plan network capacity accordingly + +### Processing Latency + +Consider end-to-end latency requirements: + +- **Realtime needs**: Omit scheduling +- **Near-realtime (< 5m)**: Short intervals +- **Batch processing**: Longer intervals or schedules +- **Compliance**: Schedule-based with guaranteed execution times + +## Migration Strategies + +### From Realtime to Scheduled + +Gradually introduce scheduling to reduce costs: + +1. Start with realtime (no scheduling) +2. Add long intervals (1h+) during testing +3. Refine intervals based on actual needs +4. Migrate to schedules for specific timing requirements + +### From Scheduled to Realtime + +Increase processing frequency for better latency: + +1. Start with daily schedules +2. Move to hourly intervals +3. Reduce to 15-minute intervals +4. Remove scheduling for realtime processing + +## Summary + +Time-based execution provides essential control over telemetry pipeline behavior. By combining schedules and intervals with routes and targets, you can: + +- Optimize resource usage and reduce costs +- Implement sophisticated data lifecycle policies +- Align processing with business requirements +- Create resilient, efficient data pipelines + +Start with simple intervals for regular processing, then add schedule-based logic for complex time-aware workflows. 
+ +**Next Steps:** +- Read Schedule Documentation for cron-based execution +- Read Interval Documentation for frequency-based execution +- Explore target-specific documentation for integration details \ No newline at end of file diff --git a/docs/configuration/scheduling/schedule.mdx b/docs/configuration/scheduling/schedule.mdx new file mode 100644 index 00000000..f22e6240 --- /dev/null +++ b/docs/configuration/scheduling/schedule.mdx @@ -0,0 +1,440 @@ +# Schedule + +## Synopsis + +Schedule allows you to execute targets and routes at specific times using cron expressions. Instead of running continuously, the component will only process data when the schedule condition is met. + +## Schema + +### Targets + +```yaml +targets: + - name: + type: + schedule: + properties: + # target-specific properties +``` + +### Routes + +```yaml +routes: + - name: + schedule: + source: + destination: +``` + +## Configuration + +|Field|Required|Default|Description| +|---|---|---|---| +|`schedule`|N|-|Cron expression defining when the component should execute| + +## Details + +The schedule field accepts standard cron expressions to control when a target or route executes. This is useful for scenarios where you want to: + +- Process data at specific times of day +- Run components during off-peak hours +- Align execution with business schedules +- Reduce resource consumption by limiting execution windows +- Create time-based routing logic + +When a schedule is configured, the component will only process queued data when the current time matches the cron expression. Between scheduled executions, data accumulates in the queue and is processed during the next scheduled window. + +### Cron Expression Format + +Standard cron format with five fields: + +``` +┌───────────── minute (0-59) +│ ┌───────────── hour (0-23) +│ │ ┌───────────── day of month (1-31) +│ │ │ ┌───────────── month (1-12) +│ │ │ │ ┌───────────── day of week (0-6, Sunday=0) +│ │ │ │ │ +* * * * * +``` + +### Special Characters + +- `*` - Any value +- `,` - Value list separator +- `-` - Range of values +- `/` - Step values + +### Time Zone + +All cron expressions are evaluated in the system's local timezone. + +## Target Examples + +### Every Hour + + + + Execute target at the start of every hour... + + + ```yaml + targets: + - name: hourly_splunk + type: splunk + schedule: "0 * * * *" + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "logs" + ``` + + + +### Business Hours Only + + + + Execute every 30 minutes between 9 AM and 5 PM on weekdays... + + + ```yaml + targets: + - name: business_hours_elastic + type: elastic + schedule: "*/30 9-17 * * 1-5" + properties: + index: "business-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Daily at Midnight + + + + Execute once per day at midnight... + + + ```yaml + targets: + - name: daily_archive + type: awss3 + schedule: "0 0 * * *" + properties: + bucket: "daily-archives" + region: "us-east-1" + ``` + + + +### Multiple Times Per Day + + + + Execute at 6 AM, 12 PM, and 6 PM every day... + + + ```yaml + targets: + - name: three_times_daily + type: splunk + schedule: "0 6,12,18 * * *" + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "scheduled-logs" + ``` + + + +### Every 15 Minutes + + + + Execute every 15 minutes throughout the day... 
+ + + ```yaml + targets: + - name: frequent_elastic + type: elastic + schedule: "*/15 * * * *" + properties: + index: "frequent-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Weekly on Mondays + + + + Execute every Monday at 9 AM... + + + ```yaml + targets: + - name: weekly_report + type: splunk + schedule: "0 9 * * 1" + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "weekly-reports" + ``` + + + +### Month End + + + + Execute on the last day of every month at midnight... + + + ```yaml + targets: + - name: month_end_processing + type: bigquery + schedule: "0 0 28-31 * *" + properties: + project: "my-project" + dataset: "monthly_data" + tables: + - "logs" + ``` + + + +### Night Processing + + + + Execute every 2 hours during night time (10 PM to 6 AM)... + + + ```yaml + targets: + - name: night_batch + type: clickhouse + schedule: "0 22,0,2,4,6 * * *" + properties: + connection_string: "clickhouse://localhost:9000" + table: "night_logs" + ``` + + + +### Weekends Only + + + + Execute every 4 hours on Saturday and Sunday... + + + ```yaml + targets: + - name: weekend_processing + type: elastic + schedule: "0 */4 * * 0,6" + properties: + index: "weekend-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Specific Day of Month + + + + Execute on the 1st and 15th of every month at noon... + + + ```yaml + targets: + - name: bi_monthly + type: splunk + schedule: "0 12 1,15 * *" + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "bi-monthly-logs" + ``` + + + +## Route Examples + +### Business Hours Routing + + + + Route data to a specific target only during business hours... + + + ```yaml + routes: + - name: business_hours_route + schedule: "* 9-17 * * 1-5" + source: "application_logs" + destination: "realtime_splunk" + + - name: off_hours_route + source: "application_logs" + destination: "batch_storage" + ``` + + + +### Daily Archive Route + + + + Route data to archive storage once per day at midnight... + + + ```yaml + routes: + - name: daily_archive_route + schedule: "0 0 * * *" + source: "processed_logs" + destination: "s3_archive" + + - name: realtime_route + source: "processed_logs" + destination: "elasticsearch" + ``` + + + +### Weekend Backup Route + + + + Route data to backup target only on weekends... + + + ```yaml + routes: + - name: weekend_backup + schedule: "0 */6 * * 0,6" + source: "production_logs" + destination: "backup_target" + + - name: primary_route + source: "production_logs" + destination: "primary_target" + ``` + + + +### Peak Hours Dual Routing + + + + Send data to multiple targets during peak hours for redundancy... + + + ```yaml + routes: + - name: peak_hours_primary + schedule: "* 8-18 * * 1-5" + source: "critical_logs" + destination: "splunk_primary" + + - name: peak_hours_secondary + schedule: "* 8-18 * * 1-5" + source: "critical_logs" + destination: "splunk_secondary" + + - name: off_peak_route + source: "critical_logs" + destination: "splunk_primary" + ``` + + + +### Hourly Aggregation Route + + + + Route aggregated data every hour while streaming raw data continuously... 
+ + + ```yaml + routes: + - name: hourly_aggregates + schedule: "0 * * * *" + source: "metrics" + destination: "bigquery_aggregates" + + - name: realtime_metrics + source: "metrics" + destination: "elasticsearch" + ``` + + + +## Common Patterns + +| Pattern | Expression | Description | +|---------|-----------|-------------| +| Every minute | `* * * * *` | Run every minute | +| Every 5 minutes | `*/5 * * * *` | Run every 5 minutes | +| Every 10 minutes | `*/10 * * * *` | Run every 10 minutes | +| Every 30 minutes | `*/30 * * * *` | Run every 30 minutes | +| Every hour | `0 * * * *` | Run at the start of every hour | +| Every 2 hours | `0 */2 * * *` | Run every 2 hours | +| Every 6 hours | `0 */6 * * *` | Run every 6 hours | +| Daily at midnight | `0 0 * * *` | Run once per day at midnight | +| Daily at noon | `0 12 * * *` | Run once per day at noon | +| Weekly (Monday) | `0 0 * * 1` | Run every Monday at midnight | +| Monthly (1st) | `0 0 1 * *` | Run on the 1st of each month | +| Weekdays only | `0 9 * * 1-5` | Run weekdays at 9 AM | +| Weekends only | `0 9 * * 0,6` | Run weekends at 9 AM | + +## Usage Notes + +- Schedule takes precedence over interval if both are configured +- Data queues up between scheduled executions +- First execution occurs at the next matching time after component initialization +- Failed executions do not trigger immediate retries, they wait for the next scheduled time +- Use schedule for predictable, time-based execution patterns +- For high-frequency or continuous processing, omit schedule or use interval instead +- Multiple routes can have different schedules, creating time-based routing logic + +## Troubleshooting + +### Component Not Executing + +Check that: +- Cron expression is valid +- System timezone matches your expectations +- Component status is enabled +- Queue contains data to process + +### Missed Executions + +If executions are being skipped: +- Verify the cron expression matches your intended schedule +- Check system logs for errors during execution attempts +- Ensure previous execution completed before next scheduled time + +### Timezone Issues + +All schedules use the system's local timezone. If your logs show unexpected execution times: +- Verify system timezone configuration +- Consider adjusting cron expression to match your timezone +- Document timezone expectations in component descriptions \ No newline at end of file diff --git a/docs/configuration/targets/elasticsearch.mdx b/docs/configuration/targets/elasticsearch.mdx index 56c837fa..71376652 100644 --- a/docs/configuration/targets/elasticsearch.mdx +++ b/docs/configuration/targets/elasticsearch.mdx @@ -4,7 +4,7 @@ ## Synopsis -Creates an **Elasticsearch** target that sends data using the **Bulk API**. Supports multiple endpoints, field normalization, and customizable batch sizing. +Creates an Elasticsearch target that sends data using the Bulk API. Supports multiple endpoints, field normalization, customizable batch sizing, and automatic load balancing across Elasticsearch nodes. ## Schema @@ -21,7 +21,6 @@ Creates an **Elasticsearch** target that sends data using the **Bulk API**. Supp timeout: insecure_skip_verify: use_compression: - version: write_action: filter_path: pipeline: @@ -30,6 +29,8 @@ Creates an **Elasticsearch** target that sends data using the **Bulk API**. 
Supp - endpoint: username: password: + interval: + schedule: ``` ## Configuration @@ -54,17 +55,18 @@ The following are the fields used to define the target: |`timeout`|N|`30`|Connection timeout in seconds| |`insecure_skip_verify`|N|`false`|Skip TLS certificate verification| |`use_compression`|N|`true`|Enable GZIP compression| -|`version`|N|`auto`|Elasticsearch version| |`write_action`|N|`create`|Bulk API action (`index`, `create`, `update`, `delete`)| |`filter_path`|N|`errors,items.*.error,items.*._index,items.*.status`|Response filter path| |`pipeline`|N|-|Ingest pipeline name| |`field_format`|N|-|Data normalization format. See applicable Normalization section| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`schedule`|N|-|Cron expression for scheduled execution. See Schedule for details| ### Endpoint |Field|Required|Default|Description| |---|---|---|---| -|`endpoint`|Y|-|Elasticsearch URL| +|`endpoint`|Y|-|Elasticsearch URL (automatically appends `/_bulk` if not present)| |`username`|N|-|Basic auth username| |`password`|N|-|Basic auth password| @@ -72,27 +74,106 @@ The following are the fields used to define the target: The target supports multiple endpoints, authentication, compression, and ingest pipelines. Data is batched for efficient delivery and can be automatically routed to different indices. -URLs are automatically appended with `/_bulk` if the suffix is not present. Events are batched until either the batch size or payload size limit is reached. +URLs are automatically appended with `/_bulk` if the suffix is not present. Events are batched until either the batch size or payload size limit is reached. -For load balancing or failover, events are sent to the endpoints in order, with subsequent endpoints used only if the previous ones fail. +For load balancing, events are sent to randomly selected endpoints. If an endpoint fails, the next endpoint in the randomized list is tried until successful delivery or all endpoints fail. -Each event is automatically enriched with a timestamp in RFC3339 format based on the log's epoch time. You can route events to different indices by setting the `SystemS3` field in your logs to the desired index name. +Each event is automatically enriched with a timestamp in RFC3339 format based on the log's epoch time. You can route events to different indices by setting the `index` field in a pipeline processor. :::warning -Long `timeout` values may lead to connection pooling issues. +Long timeout values may lead to connection pooling issues and increased resource consumption. ::: :::warning -Setting `max_payload_size_kb` too high might cause memory pressure. +Setting `max_payload_size_kb` too high might cause memory pressure and can exceed Elasticsearch's `http.max_content_length` setting (default 100MB). ::: +### Load Balancing and Failover + +When multiple endpoints are configured, the target uses randomized load balancing. For each batch: + +1. Endpoints are randomly shuffled +2. The batch is sent to the first endpoint +3. If it fails, the next endpoint in the shuffled list is tried +4. This continues until successful delivery or all endpoints fail + +If only some endpoints fail but delivery eventually succeeds, the batch is cleared and a partial error is logged. If all endpoints fail, the batch is retained for retry and a complete failure error is returned. 
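+
+A minimal multi-endpoint sketch (the hostnames are placeholders); each batch goes to one randomly selected node, with the remaining nodes acting as fallbacks:
+
+```yaml
+targets:
+  - name: balanced_elastic
+    type: elastic
+    properties:
+      index: "logs"
+      endpoints:
+        - endpoint: "http://es1:9200"
+        - endpoint: "http://es2:9200"
+        - endpoint: "http://es3:9200"
+```
+
+The High-Volume example below shows the same pattern combined with batch tuning.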
+ +### JSON Message Handling + +The target intelligently handles messages that are already in JSON format: + +- If a message contains an `@timestamp` field or is ECS-normalized, it's treated as a structured JSON document +- The JSON is parsed and sent as-is to Elasticsearch +- If parsing fails, the message is sent as plain text with an auto-generated timestamp + +This allows you to send both structured and unstructured logs through the same target. + +### Dynamic Index Routing + +Route events to different indices using pipeline processors by setting the `index` field: + +```yaml +pipelines: + - name: route_by_type + processors: + - set: + field: index + value: "error-logs" + if: "level == 'error'" + - set: + field: index + value: "metrics" + if: "type == 'metric'" +``` + +This allows flexible routing without creating multiple target configurations. + +### Bulk API Error Handling + +The target parses the bulk API response to detect individual document errors: + +- Uses `filter_path` to reduce response size and focus on error details +- Extracts error type, reason, and HTTP status for failed documents +- Returns detailed error messages indicating which documents failed and why + +Common errors include: +- Document version conflicts (for `create` action) +- Mapping errors (field type mismatches) +- Index not found or closed +- Pipeline failures (when using ingest pipelines) + +### Write Actions + +The `write_action` field determines how documents are indexed: + +- **`create`** (default): Only index if document doesn't exist. Fails on duplicates. +- **`index`**: Index or replace existing document. Always succeeds unless there's a system error. +- **`update`**: Update existing document. Fails if document doesn't exist. +- **`delete`**: Remove document. Use carefully. + +### Response Filtering + +The `filter_path` parameter filters the bulk API response to reduce network overhead: + +- **`errors`**: Boolean indicating if any operations failed +- **`items.*.error`**: Error details for failed operations +- **`items.*._index`**: Index name for each operation +- **`items.*.status`**: HTTP status code for each operation + +For high-volume scenarios, this filtering significantly reduces response size and parsing overhead. + ### Field Normalization The `field_format` property allows normalizing log data to standard formats: - `ecs` - Elastic Common Schema -Field normalization is applied before the logs are sent to Elasticsearch, ensuring consistent indexing and search capabilities. +Field normalization is applied before the logs are sent to Elasticsearch, ensuring consistent indexing and search capabilities. ECS normalization maps common fields to Elasticsearch's standard schema for improved compatibility with Kibana dashboards and detection rules. + +### Compression + +Compression is enabled by default and uses gzip to reduce network bandwidth. This adds minimal CPU overhead but can significantly improve throughput for high-volume scenarios. Disable compression only if you have bandwidth to spare and want to reduce CPU usage. ## Examples @@ -146,7 +227,7 @@ In production environments, setting `insecure_skip_verify` to `true` is not reco - Send data through an ingest pipeline... + Send data through an ingest pipeline for server-side processing... ```yaml @@ -167,7 +248,7 @@ In production environments, setting `insecure_skip_verify` to `true` is not reco - Optimized for high-volume data ingestion... + Optimized for high-volume data ingestion with load balancing... 
```yaml @@ -183,6 +264,7 @@ In production environments, setting `insecure_skip_verify` to `true` is not reco endpoints: - endpoint: "http://es1:9200" - endpoint: "http://es2:9200" + - endpoint: "http://es3:9200" ``` @@ -206,3 +288,105 @@ In production environments, setting `insecure_skip_verify` to `true` is not reco ``` + +### Index Action + + + + Using index action to allow document updates and overwrites... + + + ```yaml + targets: + - name: index_elastic + type: elastic + properties: + index: "application-logs" + write_action: "index" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +### Minimal Response + + + + Optimize for minimal response size by filtering to only errors... + + + ```yaml + targets: + - name: minimal_elastic + type: elastic + properties: + index: "logs" + filter_path: "errors" + endpoints: + - endpoint: "http://elasticsearch:9200" + ``` + + + +## Performance Tuning + +### Batch Size vs Payload Size + +Events are batched until either limit is reached: + +- **`batch_size`**: Number of events per batch +- **`max_payload_size_kb`**: Total size in kilobytes + +Tune these based on your average event size: +- **Small events (less than 1KB)**: Increase `batch_size`, keep default `max_payload_size_kb` +- **Large events (greater than 10KB)**: Keep default `batch_size`, increase `max_payload_size_kb` +- **Mixed sizes**: Monitor both limits and adjust based on actual batch sizes + +### Timeout + +Setting appropriate timeouts helps balance reliability and performance: + +- **Short timeouts (10-30s)**: Fail fast, better for real-time scenarios +- **Long timeouts (60s+)**: More tolerant of network issues, but may cause connection pooling problems + +### Compression + +Enable compression (default) for high-volume scenarios to reduce network bandwidth. Disable only if CPU is constrained and network bandwidth is abundant. + +### Filter Path + +The default `filter_path` provides detailed error information while minimizing response size. For even better performance in high-volume scenarios with low error rates, use `filter_path: "errors"` to only return the error flag. + +## Troubleshooting + +### Bulk API Errors + +Check logs for detailed error messages including: +- Document index and position in batch +- Error type and reason +- HTTP status code + +Common issues: +- **Version conflicts**: Switch to `index` action or handle conflicts in your application +- **Mapping errors**: Ensure field types match index mapping +- **Pipeline errors**: Verify ingest pipeline configuration + +### Payload Size Exceeded + +If you see "bulk request size exceeds limit" errors: +1. Reduce `batch_size` +2. Reduce `max_payload_size_kb` +3. Check Elasticsearch's `http.max_content_length` setting + +### Partial Endpoint Failures + +If some endpoints fail but delivery succeeds, check logs for partial failure errors indicating which endpoints are problematic. Verify network connectivity and Elasticsearch node health. 
+ +### All Endpoints Failed + +If all endpoints fail: +- Verify network connectivity +- Check Elasticsearch cluster health +- Ensure endpoints are accessible and not rate-limited +- Review Elasticsearch logs for errors \ No newline at end of file diff --git a/docs/configuration/targets/splunk-hec.mdx b/docs/configuration/targets/splunk-hec.mdx index 30b029db..78f97a61 100644 --- a/docs/configuration/targets/splunk-hec.mdx +++ b/docs/configuration/targets/splunk-hec.mdx @@ -4,7 +4,7 @@ ## Synopsis -Creates a Splunk HTTP Event Collector (HEC) target that sends events to one or more Splunk instances. Supports batching, compression, and field normalization. +Creates a Splunk HTTP Event Collector (HEC) target that sends events to one or more Splunk instances. Supports batching, compression, field normalization, and automatic load balancing across multiple endpoints. ## Schema @@ -22,12 +22,15 @@ Creates a Splunk HTTP Event Collector (HEC) target that sends events to one or m secret: index: source_type: + source: batch_size: timeout: tcp_routing: use_compression: insecure_skip_verify: field_format: + interval: + schedule: ``` ## Configuration @@ -57,6 +60,7 @@ The following are the fields used to define the target: |---|---|---|---| |`index`|N|-|Default Splunk index| |`source_type`|N|-|Default sourcetype for events| +|`source`|N|-|Default source for events| |`batch_size`|N|10000|Number of events to batch before sending| |`timeout`|N|30|Connection timeout in seconds| @@ -68,36 +72,75 @@ The following are the fields used to define the target: |`use_compression`|N|`true`|Enable gzip compression| |`insecure_skip_verify`|N|`false`|Skip TLS certificate verification| |`field_format`|N|-|Data normalization format. See applicable Normalization section| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`schedule`|N|-|Cron expression for scheduled execution. See Schedule for details| ## Details -The Splunk HEC target sends log data to Splunk using the HTTP Event Collector (HEC) protocol. It supports multiple authentication methods, batching, compression, and automatic failover between endpoints. +The Splunk HEC target sends log data to Splunk using the HTTP Event Collector (HEC) protocol. It supports multiple authentication methods, batching, compression, and automatic load balancing between endpoints. :::warning -Ensure your _HEC_ tokens have the appropriate permissions and indexes enabled in **Splunk**. Invalid tokens or insufficient permissions will result in ingestion failures. +Ensure your HEC tokens have the appropriate permissions and indexes enabled in Splunk. Invalid tokens or insufficient permissions will result in ingestion failures. ::: -Events are automatically batched and compressed by default for optimal performance. If multiple endpoints are configured, the target will try each endpoint in order until successful delivery occurs. +Events are automatically batched and compressed by default for optimal performance. When multiple endpoints are configured, the target randomly selects an endpoint for each batch to distribute load evenly across all available Splunk instances. :::warning Setting `insecure_skip_verify` to `true` is not recommended for production environments. ::: +### Load Balancing and Failover + +When multiple endpoints are configured, the target uses randomized load balancing. For each batch: + +1. Endpoints are randomly shuffled +2. The batch is sent to the first endpoint +3. If it fails, the next endpoint in the shuffled list is tried +4. 
This continues until successful delivery or all endpoints fail + +If only some endpoints fail but delivery eventually succeeds, the batch is cleared and a partial error is logged. If all endpoints fail, the batch is retained for retry and a complete failure error is returned. + ### Dynamic Routing -The target supports dynamic routing of events to different indexes and sourcetypes: +The target supports dynamic routing of events to different indexes, sourcetypes, and sources using pipeline processors: -- Use the `SystemS3` field in your logs to specify a custom index -- Use the `SystemS2` field to specify a custom sourcetype +- Set the `source` field in a pipeline to override the default source +- Set the `schema` field in a pipeline to override the default sourcetype +- Set the `index` field in a pipeline to override the default index This allows sending different event types to appropriate indexes without creating multiple target configurations. +Example pipeline configuration: + +```yaml +pipelines: + - name: route_by_severity + processors: + - set: + field: source + value: "production-app" + - set: + field: schema + value: "app:error" + if: "severity == 'error'" + - set: + field: index + value: "critical-logs" + if: "severity == 'critical'" +``` + +### Compression + +Compression is enabled by default and uses gzip to reduce network bandwidth. This adds minimal CPU overhead but can significantly improve throughput for high-volume scenarios. Disable compression only if you have bandwidth to spare and want to reduce CPU usage. + ### Field Normalization Field normalization helps standardize log data before sending it to Splunk, ensuring consistent data formats that can be easily correlated: - `cim` - Common Information Model +Normalization is applied before batching and sending to Splunk. + ## Examples ### Basic @@ -122,11 +165,11 @@ Field normalization helps standardize log data before sending it to Splunk, ensu -### Multiple Endpoints +### Load Balanced - Configure failover endpoints... + Configure load balancing and failover across multiple endpoints... ```yaml @@ -141,6 +184,9 @@ Field normalization helps standardize log data before sending it to Splunk, ensu - endpoint: "https://splunk2.example.com:8088/services/collector" auth_type: token token: "BACKUP-TOKEN" + - endpoint: "https://splunk3.example.com:8088/services/collector" + auth_type: token + token: "TERTIARY-TOKEN" index: "main" source_type: "vmetric" batch_size: 5000 @@ -152,7 +198,7 @@ Field normalization helps standardize log data before sending it to Splunk, ensu - Configure for high throughput... + Configure for high throughput with larger batches and extended timeout... ```yaml @@ -191,6 +237,7 @@ Field normalization helps standardize log data before sending it to Splunk, ensu auth_type: token token: "YOUR-HEC-TOKEN" index: "main" + source_type: "normalized_logs" field_format: "cim" ``` @@ -200,7 +247,7 @@ Field normalization helps standardize log data before sending it to Splunk, ensu - Using secret-based auth and TLS verification... + Using secret-based auth with TLS verification and custom source... ```yaml @@ -214,8 +261,65 @@ Field normalization helps standardize log data before sending it to Splunk, ensu secret: "YOUR-BEARER-TOKEN" index: "secure" source_type: "vmetric" + source: "production_cluster" insecure_skip_verify: false use_compression: true ``` + +### No Compression + + + + Disable compression to reduce CPU overhead when bandwidth is not a concern... 
+ + + ```yaml + targets: + - name: uncompressed_splunk + type: splunk + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + auth_type: token + token: "YOUR-HEC-TOKEN" + index: "main" + source_type: "vmetric" + use_compression: false + ``` + + + +## Performance Tuning + +### Batch Size + +- **Small batches (1000-5000)**: Lower latency, more frequent network calls +- **Medium batches (10000)**: Balanced approach, suitable for most use cases +- **Large batches (20000+)**: Higher throughput, increased memory usage and latency + +### Timeout + +Setting appropriate timeouts helps balance reliability and performance: + +- **Short timeouts (10-30s)**: Fail fast, better for real-time scenarios +- **Long timeouts (60s+)**: More tolerant of network issues, but may cause connection pooling problems + +### Compression + +Enable compression (default) for high-volume scenarios to reduce network bandwidth. Disable only if CPU is constrained and network bandwidth is abundant. + +## Troubleshooting + +### Authentication Failures + +Ensure your HEC token has proper permissions and the target index exists in Splunk. + +### Partial Endpoint Failures + +If some endpoints fail but delivery succeeds, check logs for partial failure errors indicating which endpoints are problematic. + +### All Endpoints Failed + +If all endpoints fail, verify network connectivity, endpoint URLs, and Splunk HEC service status. \ No newline at end of file From 89f806f170676fe7f9576d0db5feb56476b571ef Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Fri, 24 Oct 2025 23:22:54 +0200 Subject: [PATCH 03/13] Sidebar updates --- sidebars.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sidebars.ts b/sidebars.ts index d0025c18..824a6130 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -302,6 +302,15 @@ const sidebars: SidebarsConfig = { ], }, 'configuration/routes', + { + type: "category", + label: "Scheduling and Time-Based Execution", + items: [ + "configuration/scheduling/overview", + "configuration/scheduling/schedule", + "configuration/scheduling/interval", + ] + }, ], }, { From 14fb31530b15398ed95ce37127f3ba7189abd36a Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Fri, 24 Oct 2025 23:57:33 +0200 Subject: [PATCH 04/13] Small fixes --- docs/configuration/targets/elasticsearch.mdx | 2 +- docs/configuration/targets/splunk-hec.mdx | 2 +- topics.json | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/configuration/targets/elasticsearch.mdx b/docs/configuration/targets/elasticsearch.mdx index 71376652..0ca10802 100644 --- a/docs/configuration/targets/elasticsearch.mdx +++ b/docs/configuration/targets/elasticsearch.mdx @@ -1,6 +1,6 @@ # Elasticsearch -ExperimentalObservability +Observability ## Synopsis diff --git a/docs/configuration/targets/splunk-hec.mdx b/docs/configuration/targets/splunk-hec.mdx index 78f97a61..21dcae59 100644 --- a/docs/configuration/targets/splunk-hec.mdx +++ b/docs/configuration/targets/splunk-hec.mdx @@ -1,6 +1,6 @@ # Splunk HEC -ExperimentalObservability +Observability ## Synopsis diff --git a/topics.json b/topics.json index 406b6218..9d8fc873 100644 --- a/topics.json +++ b/topics.json @@ -27,6 +27,8 @@ "routes": "/configuration/routes", "routes-config": "/configuration/routes#configuration", "routes-implementation-strategies": "/configuration/routes#implementation-strategies", + "schedule": "/configuration/scheduling/schedule", + "interval": "/configuration/scheduling/interval", "handling-failures": 
"/configuration/pipelines/handling-failures", "handling-success": "/configuration/pipelines/handling-success", "normalization": "/configuration/pipelines/normalization", From 7e53b0bfe608a51090969dac7864c590c1437e70 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sat, 25 Oct 2025 00:25:20 +0200 Subject: [PATCH 05/13] =?UTF-8?q?Scheduling=20=C4=B1pdates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../scheduling/{schedule.mdx => cron.mdx} | 16 +++++++++++++--- .../scheduling/{interval.mdx => frequency.mdx} | 14 ++++++++++++-- docs/configuration/scheduling/overview.mdx | 14 +++++++------- docs/configuration/targets/elasticsearch.mdx | 4 ++-- docs/configuration/targets/splunk-hec.mdx | 4 ++-- sidebars.ts | 6 +++--- topics.json | 4 ++-- 7 files changed, 41 insertions(+), 21 deletions(-) rename docs/configuration/scheduling/{schedule.mdx => cron.mdx} (96%) rename docs/configuration/scheduling/{interval.mdx => frequency.mdx} (97%) diff --git a/docs/configuration/scheduling/schedule.mdx b/docs/configuration/scheduling/cron.mdx similarity index 96% rename from docs/configuration/scheduling/schedule.mdx rename to docs/configuration/scheduling/cron.mdx index f22e6240..ee24aed2 100644 --- a/docs/configuration/scheduling/schedule.mdx +++ b/docs/configuration/scheduling/cron.mdx @@ -1,8 +1,8 @@ -# Schedule +# Cron-Based ## Synopsis -Schedule allows you to execute targets and routes at specific times using cron expressions. Instead of running continuously, the component will only process data when the schedule condition is met. +Cron-Based scheduling allows you to execute targets and routes at specific times using cron expressions. Instead of running continuously, the component will only process data when the cron condition is met. ## Schema @@ -12,8 +12,8 @@ Schedule allows you to execute targets and routes at specific times using cron e targets: - name: type: - schedule: properties: + schedule: # target-specific properties ``` @@ -29,6 +29,16 @@ routes: ## Configuration +```yaml +routes: + - name: + schedule: + source: + destination: +``` + +## Configuration + |Field|Required|Default|Description| |---|---|---|---| |`schedule`|N|-|Cron expression defining when the component should execute| diff --git a/docs/configuration/scheduling/interval.mdx b/docs/configuration/scheduling/frequency.mdx similarity index 97% rename from docs/configuration/scheduling/interval.mdx rename to docs/configuration/scheduling/frequency.mdx index 91d43963..cdd31633 100644 --- a/docs/configuration/scheduling/interval.mdx +++ b/docs/configuration/scheduling/frequency.mdx @@ -1,13 +1,23 @@ -# Interval +# Frequency-Based ## Synopsis -Interval controls how frequently a target or route executes, measured in time units or seconds. This provides simple, periodic execution without the complexity of cron expressions. +Frequency controls how frequently a target or route executes, measured in time units or seconds. This provides simple, periodic execution without the complexity of cron expressions. 
## Schema ### Targets +```yaml +targets: + - name: + type: + properties: + frequency: + # target-specific properties +``` + +### Routes ```yaml targets: - name: diff --git a/docs/configuration/scheduling/overview.mdx b/docs/configuration/scheduling/overview.mdx index 5b75e960..ae7dc561 100644 --- a/docs/configuration/scheduling/overview.mdx +++ b/docs/configuration/scheduling/overview.mdx @@ -1,4 +1,4 @@ -# Scheduling and Time-Based Execution +# Overview ## Overview @@ -44,7 +44,7 @@ Minimize expenses by controlling when and how often components execute: The platform provides two complementary approaches to time-based execution: -### Schedule (Cron-Based) +### Cron-Based Execute components at specific times using cron expressions. @@ -54,9 +54,9 @@ Execute components at specific times using cron expressions. - Calendar-based patterns (monthly reports, weekend processing) - Complex time windows -**Learn more:** Schedule Documentation +**Learn more:** Cron Documentation -### Interval (Frequency-Based) +### Frequency-Based Execute components at regular intervals using simple time units. @@ -66,7 +66,7 @@ Execute components at regular intervals using simple time units. - Predictable resource consumption - Straightforward configuration -**Learn more:** Interval Documentation +**Learn more:** Frequency Documentation ## Use Cases and Scenarios @@ -513,6 +513,6 @@ Time-based execution provides essential control over telemetry pipeline behavior Start with simple intervals for regular processing, then add schedule-based logic for complex time-aware workflows. **Next Steps:** -- Read Schedule Documentation for cron-based execution -- Read Interval Documentation for frequency-based execution +- Read Cron Documentation for cron-based execution +- Read Frequency Documentation for frequency-based execution - Explore target-specific documentation for integration details \ No newline at end of file diff --git a/docs/configuration/targets/elasticsearch.mdx b/docs/configuration/targets/elasticsearch.mdx index 0ca10802..39f3e81a 100644 --- a/docs/configuration/targets/elasticsearch.mdx +++ b/docs/configuration/targets/elasticsearch.mdx @@ -59,8 +59,8 @@ The following are the fields used to define the target: |`filter_path`|N|`errors,items.*.error,items.*._index,items.*.status`|Response filter path| |`pipeline`|N|-|Ingest pipeline name| |`field_format`|N|-|Data normalization format. See applicable Normalization section| -|`interval`|N|realtime|Execution frequency. See Interval for details| -|`schedule`|N|-|Cron expression for scheduled execution. See Schedule for details| +|`frequency`|N|realtime|Execution frequency. See Frequency for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| ### Endpoint diff --git a/docs/configuration/targets/splunk-hec.mdx b/docs/configuration/targets/splunk-hec.mdx index 21dcae59..36be8158 100644 --- a/docs/configuration/targets/splunk-hec.mdx +++ b/docs/configuration/targets/splunk-hec.mdx @@ -72,8 +72,8 @@ The following are the fields used to define the target: |`use_compression`|N|`true`|Enable gzip compression| |`insecure_skip_verify`|N|`false`|Skip TLS certificate verification| |`field_format`|N|-|Data normalization format. See applicable Normalization section| -|`interval`|N|realtime|Execution frequency. See Interval for details| -|`schedule`|N|-|Cron expression for scheduled execution. See Schedule for details| +|`frequency`|N|realtime|Execution frequency. See Frequency for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| ## Details diff --git a/sidebars.ts b/sidebars.ts index 824a6130..50bad0a3 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -304,11 +304,11 @@ const sidebars: SidebarsConfig = { 'configuration/routes', { type: "category", - label: "Scheduling and Time-Based Execution", + label: "Scheduling", items: [ "configuration/scheduling/overview", - "configuration/scheduling/schedule", - "configuration/scheduling/interval", + "configuration/scheduling/cron", + "configuration/scheduling/frequency", ] }, ], diff --git a/topics.json b/topics.json index 9d8fc873..2498d430 100644 --- a/topics.json +++ b/topics.json @@ -27,8 +27,8 @@ "routes": "/configuration/routes", "routes-config": "/configuration/routes#configuration", "routes-implementation-strategies": "/configuration/routes#implementation-strategies", - "schedule": "/configuration/scheduling/schedule", - "interval": "/configuration/scheduling/interval", + "cron": "/configuration/scheduling/cron", + "frequency": "/configuration/scheduling/frequency", "handling-failures": "/configuration/pipelines/handling-failures", "handling-success": "/configuration/pipelines/handling-success", "normalization": "/configuration/pipelines/normalization", From fddcff70ba238bcda8dfe642c96dab877dc32cef Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 10:18:12 +0100 Subject: [PATCH 06/13] Scheduler documentation --- docs/configuration/scheduling/cron.mdx | 73 ++++++------- .../{frequency.mdx => interval.mdx} | 4 +- docs/configuration/scheduling/overview.mdx | 101 ++++++++++-------- sidebars.ts | 2 +- topics.json | 2 +- 5 files changed, 95 insertions(+), 87 deletions(-) rename docs/configuration/scheduling/{frequency.mdx => interval.mdx} (98%) diff --git a/docs/configuration/scheduling/cron.mdx b/docs/configuration/scheduling/cron.mdx index ee24aed2..1861ba6d 100644 --- a/docs/configuration/scheduling/cron.mdx +++ b/docs/configuration/scheduling/cron.mdx @@ -1,8 +1,8 @@ -# Cron-Based +# Cron ## Synopsis -Cron-Based scheduling allows you to execute targets and routes at specific times using cron expressions. Instead of running continuously, the component will only process data when the cron condition is met. +Cron-based execution allows targets and routes to run at specific times using cron expressions. Instead of running continuously, the component will only process data when the cron schedule condition is met. ## Schema @@ -13,7 +13,7 @@ targets: - name: type: properties: - schedule: + cron: # target-specific properties ``` @@ -22,30 +22,21 @@ targets: ```yaml routes: - name: - schedule: - source: - destination: -``` - -## Configuration - -```yaml -routes: - - name: - schedule: source: destination: + properties: + cron: ``` ## Configuration |Field|Required|Default|Description| |---|---|---|---| -|`schedule`|N|-|Cron expression defining when the component should execute| +|`cron`|N|-|Cron expression defining when the component should execute| ## Details -The schedule field accepts standard cron expressions to control when a target or route executes. This is useful for scenarios where you want to: +The cron field accepts standard cron expressions to control when a target or route executes. 
This is useful for scenarios where you want to: - Process data at specific times of day - Run components during off-peak hours @@ -53,7 +44,7 @@ The schedule field accepts standard cron expressions to control when a target or - Reduce resource consumption by limiting execution windows - Create time-based routing logic -When a schedule is configured, the component will only process queued data when the current time matches the cron expression. Between scheduled executions, data accumulates in the queue and is processed during the next scheduled window. +When a cron schedule is configured, the component will only process queued data when the current time matches the cron expression. Between scheduled executions, data accumulates in the queue and is processed during the next scheduled window. ### Cron Expression Format @@ -93,8 +84,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: hourly_splunk type: splunk - schedule: "0 * * * *" properties: + cron: "0 * * * *" endpoints: - endpoint: "https://splunk.example.com:8088/services/collector" token: "YOUR-TOKEN" @@ -114,8 +105,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: business_hours_elastic type: elastic - schedule: "*/30 9-17 * * 1-5" properties: + cron: "*/30 9-17 * * 1-5" index: "business-logs" endpoints: - endpoint: "http://elasticsearch:9200" @@ -134,8 +125,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: daily_archive type: awss3 - schedule: "0 0 * * *" properties: + cron: "0 0 * * *" bucket: "daily-archives" region: "us-east-1" ``` @@ -153,8 +144,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: three_times_daily type: splunk - schedule: "0 6,12,18 * * *" properties: + cron: "0 6,12,18 * * *" endpoints: - endpoint: "https://splunk.example.com:8088/services/collector" token: "YOUR-TOKEN" @@ -174,8 +165,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: frequent_elastic type: elastic - schedule: "*/15 * * * *" properties: + cron: "*/15 * * * *" index: "frequent-logs" endpoints: - endpoint: "http://elasticsearch:9200" @@ -194,8 +185,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: weekly_report type: splunk - schedule: "0 9 * * 1" properties: + cron: "0 9 * * 1" endpoints: - endpoint: "https://splunk.example.com:8088/services/collector" token: "YOUR-TOKEN" @@ -215,8 +206,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: month_end_processing type: bigquery - schedule: "0 0 28-31 * *" properties: + cron: "0 0 28-31 * *" project: "my-project" dataset: "monthly_data" tables: @@ -236,8 +227,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: night_batch type: clickhouse - schedule: "0 22,0,2,4,6 * * *" properties: + cron: "0 22,0,2,4,6 * * *" connection_string: "clickhouse://localhost:9000" table: "night_logs" ``` @@ -255,8 +246,8 @@ All cron expressions are evaluated in the system's local timezone. targets: - name: weekend_processing type: elastic - schedule: "0 */4 * * 0,6" properties: + cron: "0 */4 * * 0,6" index: "weekend-logs" endpoints: - endpoint: "http://elasticsearch:9200" @@ -275,8 +266,8 @@ All cron expressions are evaluated in the system's local timezone. 
targets: - name: bi_monthly type: splunk - schedule: "0 12 1,15 * *" properties: + cron: "0 12 1,15 * *" endpoints: - endpoint: "https://splunk.example.com:8088/services/collector" token: "YOUR-TOKEN" @@ -297,9 +288,10 @@ All cron expressions are evaluated in the system's local timezone. ```yaml routes: - name: business_hours_route - schedule: "* 9-17 * * 1-5" source: "application_logs" destination: "realtime_splunk" + properties: + cron: "* 9-17 * * 1-5" - name: off_hours_route source: "application_logs" @@ -318,9 +310,10 @@ All cron expressions are evaluated in the system's local timezone. ```yaml routes: - name: daily_archive_route - schedule: "0 0 * * *" source: "processed_logs" destination: "s3_archive" + properties: + cron: "0 0 * * *" - name: realtime_route source: "processed_logs" @@ -339,9 +332,10 @@ All cron expressions are evaluated in the system's local timezone. ```yaml routes: - name: weekend_backup - schedule: "0 */6 * * 0,6" source: "production_logs" destination: "backup_target" + properties: + cron: "0 */6 * * 0,6" - name: primary_route source: "production_logs" @@ -360,14 +354,16 @@ All cron expressions are evaluated in the system's local timezone. ```yaml routes: - name: peak_hours_primary - schedule: "* 8-18 * * 1-5" source: "critical_logs" destination: "splunk_primary" + properties: + cron: "* 8-18 * * 1-5" - name: peak_hours_secondary - schedule: "* 8-18 * * 1-5" source: "critical_logs" destination: "splunk_secondary" + properties: + cron: "* 8-18 * * 1-5" - name: off_peak_route source: "critical_logs" @@ -386,9 +382,10 @@ All cron expressions are evaluated in the system's local timezone. ```yaml routes: - name: hourly_aggregates - schedule: "0 * * * *" source: "metrics" destination: "bigquery_aggregates" + properties: + cron: "0 * * * *" - name: realtime_metrics source: "metrics" @@ -417,13 +414,13 @@ All cron expressions are evaluated in the system's local timezone. ## Usage Notes -- Schedule takes precedence over interval if both are configured +- Cron takes precedence over interval if both are configured - Data queues up between scheduled executions - First execution occurs at the next matching time after component initialization - Failed executions do not trigger immediate retries, they wait for the next scheduled time -- Use schedule for predictable, time-based execution patterns -- For high-frequency or continuous processing, omit schedule or use interval instead -- Multiple routes can have different schedules, creating time-based routing logic +- Use cron for predictable, time-based execution patterns +- For high-frequency or continuous processing, omit cron or use interval instead +- Multiple routes can have different cron schedules, creating time-based routing logic ## Troubleshooting @@ -444,7 +441,7 @@ If executions are being skipped: ### Timezone Issues -All schedules use the system's local timezone. If your logs show unexpected execution times: +All cron schedules use the system's local timezone. 
If your logs show unexpected execution times: - Verify system timezone configuration - Consider adjusting cron expression to match your timezone - Document timezone expectations in component descriptions \ No newline at end of file diff --git a/docs/configuration/scheduling/frequency.mdx b/docs/configuration/scheduling/interval.mdx similarity index 98% rename from docs/configuration/scheduling/frequency.mdx rename to docs/configuration/scheduling/interval.mdx index cdd31633..0cbbe484 100644 --- a/docs/configuration/scheduling/frequency.mdx +++ b/docs/configuration/scheduling/interval.mdx @@ -1,8 +1,8 @@ -# Frequency-Based +# Interval-Based ## Synopsis -Frequency controls how frequently a target or route executes, measured in time units or seconds. This provides simple, periodic execution without the complexity of cron expressions. +Interval controls how frequently a target or route executes, measured in time units or seconds. This provides simple, periodic execution without the complexity of cron expressions. ## Schema diff --git a/docs/configuration/scheduling/overview.mdx b/docs/configuration/scheduling/overview.mdx index ae7dc561..dfd7b233 100644 --- a/docs/configuration/scheduling/overview.mdx +++ b/docs/configuration/scheduling/overview.mdx @@ -2,7 +2,7 @@ ## Overview -Scheduling capabilities provide powerful control over when and how often your telemetry pipeline components execute. By adding temporal logic to targets and routes, you can optimize resource usage, reduce costs, implement time-based routing strategies, and align data processing with business requirements. +Execution timing capabilities provide powerful control over when and how often your telemetry pipeline components execute. By adding temporal logic to targets and routes, you can optimize resource usage, reduce costs, implement time-based routing strategies, and align data processing with business requirements. ## What You Can Do @@ -40,11 +40,11 @@ Minimize expenses by controlling when and how often components execute: - Use cheaper storage during off-peak hours - Implement cost-effective retention policies -## Scheduling Methods +## Timing Methods The platform provides two complementary approaches to time-based execution: -### Cron-Based +### Cron Execute components at specific times using cron expressions. @@ -56,7 +56,7 @@ Execute components at specific times using cron expressions. **Learn more:** Cron Documentation -### Frequency-Based +### Interval Execute components at regular intervals using simple time units. @@ -66,7 +66,7 @@ Execute components at regular intervals using simple time units. 
- Predictable resource consumption - Straightforward configuration -**Learn more:** Frequency Documentation +**Learn more:** Interval Documentation ## Use Cases and Scenarios @@ -78,16 +78,18 @@ Route data to different destinations based on time: # Send to realtime analytics during business hours routes: - name: business_hours_analytics - schedule: "* 9-17 * * 1-5" source: "application_logs" destination: "realtime_splunk" + properties: + cron: "* 9-17 * * 1-5" # Archive all data once per day routes: - name: daily_archive - schedule: "0 0 * * *" source: "application_logs" destination: "s3_archive" + properties: + cron: "0 0 * * *" # Default realtime processing routes: @@ -129,8 +131,8 @@ targets: targets: - name: cold_tier type: awss3 - schedule: "0 2 * * *" properties: + cron: "0 2 * * *" bucket: "logs-cold" region: "us-east-1" ``` @@ -148,16 +150,18 @@ Process critical data differently during business hours: # High priority during business hours routes: - name: business_hours_critical - schedule: "* 9-17 * * 1-5" source: "error_logs" destination: "alert_system" + properties: + cron: "* 9-17 * * 1-5" # Batch processing after hours routes: - name: after_hours_batch - schedule: "0 18-8 * * *" source: "error_logs" destination: "batch_processor" + properties: + cron: "0 18-8 * * *" ``` **Benefits:** @@ -183,8 +187,8 @@ targets: targets: - name: compliance_archive type: awss3 - schedule: "0 1 * * *" properties: + cron: "0 1 * * *" bucket: "logs-compliance" region: "us-east-1" @@ -192,8 +196,8 @@ targets: targets: - name: longterm_archive type: glacier - schedule: "0 3 * * 0" properties: + cron: "0 3 * * 0" vault: "logs-longterm" ``` @@ -210,16 +214,18 @@ Handle varying data volumes throughout the day: # Realtime processing during low-traffic hours routes: - name: night_realtime - schedule: "* 0-6,22-23 * * *" source: "high_volume_logs" destination: "realtime_processor" + properties: + cron: "* 0-6,22-23 * * *" # Batched processing during peak hours routes: - name: peak_hours_batch - schedule: "*/15 7-21 * * *" source: "high_volume_logs" destination: "batch_processor" + properties: + cron: "*/15 7-21 * * *" ``` **Benefits:** @@ -236,8 +242,8 @@ Distribute data across regions based on time zones: targets: - name: us_target type: splunk - schedule: "* 9-17 * * 1-5" properties: + cron: "* 9-17 * * 1-5" endpoints: - endpoint: "https://us-splunk.example.com:8088/services/collector" token: "US-TOKEN" @@ -246,8 +252,8 @@ targets: targets: - name: eu_target type: splunk - schedule: "* 9-17 * * 1-5" properties: + cron: "* 9-17 * * 1-5" endpoints: - endpoint: "https://eu-splunk.example.com:8088/services/collector" token: "EU-TOKEN" @@ -282,16 +288,16 @@ targets: targets: - name: warm_storage type: clickhouse - schedule: "0 2 * * 0" properties: + cron: "0 2 * * 0" table: "logs_archive" # Monthly move to cold storage targets: - name: cold_storage type: awss3 - schedule: "0 3 1 * *" properties: + cron: "0 3 1 * *" bucket: "logs-cold-archive" ``` @@ -333,17 +339,17 @@ targets: ## Configuration Patterns -### Combining Schedule and Interval +### Combining Cron and Interval Use both for different components in the same pipeline: ```yaml -# Scheduled target for compliance +# Cron-based target for compliance targets: - name: compliance_target type: awss3 - schedule: "0 0 * * *" properties: + cron: "0 0 * * *" bucket: "compliance-logs" # Interval-based target for monitoring @@ -359,7 +365,7 @@ targets: - name: alert_target type: splunk properties: - # No schedule or interval - realtime + # No cron or interval - 
realtime index: "alerts" ``` @@ -371,21 +377,24 @@ Create fallback routing with time constraints: routes: # Priority 1: Business hours to premium service - name: priority_route - schedule: "* 9-17 * * 1-5" source: "logs" destination: "premium_target" + properties: + cron: "* 9-17 * * 1-5" # Priority 2: Night hours to standard service - name: night_route - schedule: "* 0-8,18-23 * * *" source: "logs" destination: "standard_target" + properties: + cron: "* 0-8,18-23 * * *" # Priority 3: Weekend to basic service - name: weekend_route - schedule: "* * * * 0,6" source: "logs" destination: "basic_target" + properties: + cron: "* * * * 0,6" # Default: fallback to archive - name: fallback_route @@ -404,17 +413,18 @@ properties: interval: "5m" ``` -Migrate to schedule when you need specific timing: +Migrate to cron when you need specific timing: ```yaml -schedule: "0 */6 * * *" +properties: + cron: "0 */6 * * *" ``` ### Consider Data Volume -Match scheduling to expected data volume: +Match timing configuration to expected data volume: -- **High volume**: Use longer intervals or specific schedules +- **High volume**: Use longer intervals or specific cron schedules - **Low volume**: Realtime or short intervals work well - **Variable volume**: Use time-based routing to handle peaks @@ -428,16 +438,17 @@ Between scheduled executions, data accumulates in queues: ### Document Timezone Assumptions -All schedules use system local time: +All cron schedules use system local time: ```yaml # Runs at midnight system local time -schedule: "0 0 * * *" +properties: + cron: "0 0 * * *" ``` Always document expected timezone in configuration comments. -### Test Schedule Expressions +### Test Cron Expressions Validate cron expressions before deployment: @@ -447,7 +458,7 @@ Validate cron expressions before deployment: ### Plan for Failures -Schedule-based components don't retry immediately: +Cron-based components don't retry immediately: - Failed executions wait for next scheduled time - Ensure monitoring alerts for execution failures @@ -465,7 +476,7 @@ Longer intervals mean larger queues: ### Network Impact -Scheduling affects network patterns: +Timing configuration affects network patterns: - **Realtime**: Constant network usage - **Short intervals**: Frequent bursts @@ -476,43 +487,43 @@ Scheduling affects network patterns: Consider end-to-end latency requirements: -- **Realtime needs**: Omit scheduling +- **Realtime needs**: Omit timing configuration - **Near-realtime (< 5m)**: Short intervals -- **Batch processing**: Longer intervals or schedules -- **Compliance**: Schedule-based with guaranteed execution times +- **Batch processing**: Longer intervals or cron schedules +- **Compliance**: Cron-based with guaranteed execution times ## Migration Strategies ### From Realtime to Scheduled -Gradually introduce scheduling to reduce costs: +Gradually introduce timing controls to reduce costs: -1. Start with realtime (no scheduling) +1. Start with realtime (no timing configuration) 2. Add long intervals (1h+) during testing 3. Refine intervals based on actual needs -4. Migrate to schedules for specific timing requirements +4. Migrate to cron schedules for specific timing requirements ### From Scheduled to Realtime Increase processing frequency for better latency: -1. Start with daily schedules +1. Start with daily cron schedules 2. Move to hourly intervals 3. Reduce to 15-minute intervals -4. Remove scheduling for realtime processing +4. 
Remove timing configuration for realtime processing ## Summary -Time-based execution provides essential control over telemetry pipeline behavior. By combining schedules and intervals with routes and targets, you can: +Time-based execution provides essential control over telemetry pipeline behavior. By combining cron and interval timing with routes and targets, you can: - Optimize resource usage and reduce costs - Implement sophisticated data lifecycle policies - Align processing with business requirements - Create resilient, efficient data pipelines -Start with simple intervals for regular processing, then add schedule-based logic for complex time-aware workflows. +Start with simple intervals for regular processing, then add cron-based logic for complex time-aware workflows. **Next Steps:** - Read Cron Documentation for cron-based execution -- Read Frequency Documentation for frequency-based execution +- Read Interval Documentation for interval-based execution - Explore target-specific documentation for integration details \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 50bad0a3..06723bea 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -308,7 +308,7 @@ const sidebars: SidebarsConfig = { items: [ "configuration/scheduling/overview", "configuration/scheduling/cron", - "configuration/scheduling/frequency", + "configuration/scheduling/interval", ] }, ], diff --git a/topics.json b/topics.json index 2498d430..ab17922d 100644 --- a/topics.json +++ b/topics.json @@ -28,7 +28,7 @@ "routes-config": "/configuration/routes#configuration", "routes-implementation-strategies": "/configuration/routes#implementation-strategies", "cron": "/configuration/scheduling/cron", - "frequency": "/configuration/scheduling/frequency", + "interval": "/configuration/scheduling/interval", "handling-failures": "/configuration/pipelines/handling-failures", "handling-success": "/configuration/pipelines/handling-success", "normalization": "/configuration/pipelines/normalization", From 206131b61f55c8753dc2fcb4cff1e34fc16894a7 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 10:18:35 +0100 Subject: [PATCH 07/13] Update targets for v1.5.0 --- docs/configuration/targets/aws-s3.mdx | 543 ++++++++---------- .../targets/azure-blob-storage.mdx | 11 +- .../targets/azure-data-explorer.mdx | 11 +- docs/configuration/targets/bigquery.mdx | 11 +- docs/configuration/targets/clickhouse.mdx | 19 + docs/configuration/targets/console.mdx | 32 +- docs/configuration/targets/elasticsearch.mdx | 25 +- docs/configuration/targets/event-hubs.mdx | 19 + docs/configuration/targets/file.mdx | 19 + .../targets/microsoft-sentinel-data-lake.mdx | 11 +- .../targets/microsoft-sentinel.mdx | 11 +- docs/configuration/targets/overview.mdx | 180 +++++- docs/configuration/targets/splunk-hec.mdx | 35 +- docs/configuration/targets/syslog.mdx | 19 + 14 files changed, 606 insertions(+), 340 deletions(-) diff --git a/docs/configuration/targets/aws-s3.mdx b/docs/configuration/targets/aws-s3.mdx index 27535343..4d0ed843 100644 --- a/docs/configuration/targets/aws-s3.mdx +++ b/docs/configuration/targets/aws-s3.mdx @@ -1,26 +1,27 @@ ---- -description: AWS S3 target for cloud storage integration with file format support and Security Lake compatibility -sidebar_custom_props: - customCategory: "Targets" - customIcon: "☁️" ---- - # AWS S3 Amazon AWSCloud Storage ## Synopsis -AWS S3 target enables data export to Amazon Simple Storage Service (S3) buckets with support for multiple file formats (JSON, Avro, Parquet), compression options, 
and AWS Security Lake integration. The target handles multipart uploads for large files and supports both direct S3 uploads and Azure Function App integration for indirect uploads. +Creates a target that writes log messages to Amazon S3 buckets with support for various file formats, authentication methods, and multipart uploads. The target handles large file uploads efficiently with configurable rotation based on size or event count. ## Schema -```yaml {2-6} -targets: - - name: - type: awss3 +```yaml {1,3} +- name: + description: + type: awss3 + pipelines: + status: + properties: key: secret: + session: + region: + endpoint: + part_size: + bucket: buckets: - bucket: name: @@ -28,335 +29,273 @@ targets: compression: extension: schema: - size: - batch: - region: - endpoint: - session: - source: - account: - part_size: - function: - url: - method: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: + drop_events: ``` ## Configuration +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `awss3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + ### AWS Credentials -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`key`|string|Y*|-|AWS access key ID for authentication| -|`secret`|string|Y*|-|AWS secret access key for authentication| -|`session`|string|N|-|Optional session token for temporary credentials| -|`region`|string|Y|-|AWS region (e.g., `us-east-1`, `eu-west-1`)| -|`endpoint`|string|N|-|Custom S3-compatible endpoint URL (for non-AWS S3 services)| +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|N*|-|AWS access key ID for authentication| +|`secret`|N*|-|AWS secret access key for authentication| +|`session`|N|-|Optional session token for temporary credentials| +|`region`|Y|-|AWS region (e.g., `us-east-1`, `eu-west-1`)| +|`endpoint`|N|-|Custom S3-compatible endpoint URL (for non-AWS S3 services)| \* = Conditionally required. AWS credentials (`key` and `secret`) are required unless using IAM role-based authentication on AWS infrastructure. ### Connection -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`name`|string|Y|-|Unique identifier for the target| -|`type`|string|Y|`awss3`|Target type identifier (must be `awss3`)| -|`part_size`|integer|N|`5242880`|Multipart upload part size in bytes (minimum 5MB)| +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. 
See applicable Normalization section| ### Files -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`buckets`|array|Y|-|Array of bucket configurations for file distribution| -|`buckets.bucket`|string|Y|-|S3 bucket name| -|`buckets.name`|string|Y|-|File name template (supports variables: `{date}`, `{time}`, `{unix}`, `{tag}`)| -|`buckets.format`|string|Y|-|Output format: `json`, `multijson`, `avro`, `parquet`| -|`buckets.compression`|string|N|-|Compression algorithm: `gzip`, `snappy`, `deflate`| -|`buckets.extension`|string|N|-|File extension override (defaults to format-specific extension)| -|`buckets.schema`|string|N*|-|Schema definition file path (required for Avro and Parquet formats)| -|`buckets.size`|integer|N|`10485760`|Maximum file size in bytes before rotation (10MB default)| -|`buckets.batch`|integer|N|`1000`|Maximum number of events per file| +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default S3 bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|S3 bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to S3 and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| +|`debug.drop_events`|N|`false`|Drop unknown stream events instead of failing| -\* = Conditionally required. `schema` field is required when `format` is set to `avro` or `parquet`. +## Details -### AWS Security Lake +The AWS S3 target supports writing to different buckets with various file formats and schemas. The target provides enterprise-grade cloud storage integration with comprehensive file format support. -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`source`|string|N*|-|Security Lake source identifier| -|`account`|string|N*|-|AWS account ID for Security Lake| +### Authentication Methods -\* = Conditionally required. 
When `source`, `region`, and `account` are all provided, files use Security Lake path structure: `ext/{source}/region={region}/accountId={account}/eventDay={date}/{file}` +Supports static credentials (access key and secret key) with optional session tokens for temporary credentials. When deployed on AWS infrastructure, can leverage IAM role-based authentication without explicit credentials. -### Azure Function App Integration +### File Formats -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`function.url`|string|N|-|Azure Function App endpoint URL for indirect uploads| -|`function.method`|string|N|`POST`|HTTP method for function app requests| +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| -### Debug +### Compression -|Parameter|Type|Required|Default|Description| -|---|---|---|---|---| -|`description`|string|N|-|Optional description of target purpose| -|`tag`|string|N|-|Target identifier tag for routing and filtering| -|`status`|boolean|N|`true`|Enable or disable target processing| +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. -## Details +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| -The target provides enterprise-grade cloud storage integration with comprehensive file format support and AWS Security Lake compatibility. +### File Management -**Authentication Methods**: Supports static credentials (access key and secret key) with optional session tokens for temporary credentials. When deployed on AWS infrastructure, can leverage IAM role-based authentication without explicit credentials. +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. -**File Formats**: Supports four output formats with distinct use cases: +### Templates -- `json`: Single JSON object per file (human-readable, suitable for small datasets) -- `multijson`: Newline-delimited JSON objects (streaming format, efficient for large datasets) -- `avro`: Schema-based binary serialization (compact, schema evolution support) -- `parquet`: Columnar storage format (optimized for analytics, compression-friendly) +The following template variables can be used in file names: -**Compression Options**: All formats support optional compression (`gzip`, `snappy`, `deflate`) to reduce storage costs and transfer times. Compression is applied before upload. +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`awss3`| +|`{{.Table}}`|Bucket name|`logs`| -**File Management**: Files are rotated based on size (`size` parameter) or event count (`batch` parameter), whichever limit is reached first. 
Template variables in file names (`{date}`, `{time}`, `{unix}`, `{tag}`) enable dynamic file naming for time-based partitioning. +### Multipart Upload -**Multipart Upload**: Large files automatically use S3 multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. +Large files automatically use S3 multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. -**Multiple Buckets**: Single target can write to multiple S3 buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). +### Multiple Buckets -**AWS Security Lake Integration**: When **source**, **region**, and **account** parameters are configured, files are uploaded using Security Lake path structure: `ext/{source}/region={region}/accountId={account}/eventDay={date}/{file}`. This enables automatic ingestion by AWS Security Lake services. +Single target can write to multiple S3 buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). -**Azure Function App Integration**: Optional indirect upload via Azure Function App endpoint. When configured, target sends file data to function app instead of directly to S3, enabling custom processing or authentication workflows. +### Schema Requirements -**Schema Requirements**: Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. ## Examples ### Basic Configuration - - - Configuring basic AWS S3 target with JSON output to single bucket... - - - ```yaml - targets: - - name: aws-s3-logs - type: awss3 - key: AKIAIOSFODNN7EXAMPLE - secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: us-east-1 - buckets: - - bucket: datastream-logs - name: "events-{date}-{time}.json" - format: json - size: 10485760 - batch: 1000 - ``` - - - Target writes JSON files to datastream-logs bucket with date/time naming... - - - ```text - S3 path: s3://datastream-logs/events-2024-01-15-103000.json - File format: JSON (single object per file) - Rotation: 10MB or 1000 events - ``` - - +The minimum configuration for a JSON S3 target: + +```yaml +targets: + - name: basic_s3 + type: awss3 + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + bucket: "datastream-logs" +``` ### Multiple Buckets - - - Distributing data across multiple S3 buckets with different formats... - - - ```yaml - targets: - - name: multi-bucket-export - type: awss3 - key: AKIAIOSFODNN7EXAMPLE - secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: eu-west-1 - buckets: - - bucket: raw-data-archive - name: "raw-{date}.json" - format: multijson - compression: gzip - size: 52428800 - batch: 10000 - - bucket: analytics-data - name: "analytics-{date}.parquet" - format: parquet - schema: /etc/datastream/schemas/events.parquet - compression: snappy - size: 104857600 - batch: 50000 - ``` - - - Target writes compressed JSON to raw-data-archive and Parquet to analytics-data... 
- - - ```text - Bucket 1: s3://raw-data-archive/raw-2024-01-15.json.gz - Format: Newline-delimited JSON, gzip compressed - - Bucket 2: s3://analytics-data/analytics-2024-01-15.parquet - Format: Parquet with Snappy compression - ``` - - - -### Parquet Format with Schema - - - - Configuring Parquet output with schema definition for analytics workloads... - - - ```yaml - targets: - - name: parquet-analytics - type: awss3 - key: AKIAIOSFODNN7EXAMPLE - secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: us-west-2 - buckets: - - bucket: analytics-lake - name: "events/{date}/part-{time}.parquet" - format: parquet - schema: /etc/datastream/schemas/telemetry.parquet - compression: snappy - size: 134217728 - batch: 100000 - ``` - - - Target generates Parquet files with date-based partitioning for analytics queries... - - - ```text - S3 path: s3://analytics-lake/events/2024-01-15/part-103000.parquet - Format: Parquet (columnar storage) - Compression: Snappy - Partition: Date-based directory structure - ``` - - - -### AWS Security Lake Integration - - - - Configuring target for AWS Security Lake with required path structure... - - - ```yaml - targets: - - name: security-lake-export - type: awss3 - key: AKIAIOSFODNN7EXAMPLE - secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: us-east-1 - source: datastream - account: "123456789012" - buckets: - - bucket: security-lake-bucket - name: "events-{unix}.parquet" - format: parquet - schema: /etc/datastream/schemas/ocsf.parquet - compression: gzip - size: 104857600 - batch: 50000 - ``` - - - Target uses Security Lake path structure for automatic ingestion... - - - ```text - S3 path: s3://security-lake-bucket/ext/datastream/region=us-east-1/accountId=123456789012/eventDay=20240115/events-1705318800.parquet - - Path structure enables AWS Security Lake automatic discovery and ingestion. - ``` - - - -### Azure Function App Integration - - - - Routing S3 uploads through Azure Function App for custom processing... - - - ```yaml - targets: - - name: function-app-s3 - type: awss3 - key: AKIAIOSFODNN7EXAMPLE - secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: us-east-1 - function: - url: https://my-function-app.azurewebsites.net/api/s3upload - method: POST - buckets: - - bucket: processed-data - name: "processed-{date}.json" - format: json - size: 10485760 - batch: 1000 - ``` - - - Target sends file data to Azure Function App instead of direct S3 upload... - - - ```text - Flow: DataStream → Azure Function App → AWS S3 - - Function App can perform: - - Custom authentication workflows - - Data transformation before upload - - Additional validation or processing - ``` - - - -### S3-Compatible Storage - - - - Using custom endpoint for S3-compatible storage services (MinIO, Wasabi, etc.)... - - - ```yaml - targets: - - name: minio-storage - type: awss3 - key: minioadmin - secret: minioadmin - region: us-east-1 - endpoint: https://minio.example.com:9000 - buckets: - - bucket: telemetry-data - name: "logs-{date}.json" - format: multijson - compression: gzip - size: 10485760 - batch: 5000 - ``` - - - Target connects to MinIO or other S3-compatible services using custom endpoint... 
- - - ```text - Storage: https://minio.example.com:9000/telemetry-data/logs-2024-01-15.json.gz - - Compatible with: MinIO, Wasabi, DigitalOcean Spaces, and other S3-compatible services - ``` - - +Configuration for distributing data across multiple S3 buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: awss3 + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "eu-west-1" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: awss3 + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-west-2" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_s3 + type: awss3 + pipelines: + - checkpoint + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_s3 + type: awss3 + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_s3 + type: awss3 + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true + drop_events: true +``` \ No newline at end of file diff --git a/docs/configuration/targets/azure-blob-storage.mdx b/docs/configuration/targets/azure-blob-storage.mdx index cd68c037..65c07ff3 100644 --- a/docs/configuration/targets/azure-blob-storage.mdx +++ b/docs/configuration/targets/azure-blob-storage.mdx @@ -32,6 +32,8 @@ Creates a target that writes log messages to _Azure Blob Storage_ with support f timeout: max_size: batch_size: + interval: + cron: debug: status: dont_send_logs: @@ -89,12 +91,19 @@ The following fields can be used for files: |`no_buffer`|N|`false`|Disable write buffering| |`field_format`|N|-|Data normalization format. See applicable Normalization section| +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + ### Debug Options |Field|Required|Default|Description| |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| -|`debug.dont_send_logs`|N|`false`|Process logs but don't send to Azure (testing)| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Details diff --git a/docs/configuration/targets/azure-data-explorer.mdx b/docs/configuration/targets/azure-data-explorer.mdx index 0f70fbc0..ef4fefbd 100644 --- a/docs/configuration/targets/azure-data-explorer.mdx +++ b/docs/configuration/targets/azure-data-explorer.mdx @@ -32,6 +32,8 @@ Creates an Azure Data Explorer (Kusto) target that ingests data directly into Az tables: - name: schema: + interval: + cron: debug: status: dont_send_logs: @@ -91,12 +93,19 @@ targets: schema: "" ``` +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + ### Debug Options |Field|Required|Default|Description| |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| -|`debug.dont_send_logs`|N|`false`|Process logs but don't send to ADX (testing)| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Details diff --git a/docs/configuration/targets/bigquery.mdx b/docs/configuration/targets/bigquery.mdx index ee543f55..9e07f22a 100644 --- a/docs/configuration/targets/bigquery.mdx +++ b/docs/configuration/targets/bigquery.mdx @@ -28,6 +28,8 @@ Creates a BigQuery target that streams data directly into BigQuery tables using tables: - name: schema: + interval: + cron: debug: status: dont_send_logs: @@ -101,12 +103,19 @@ Supported types: - `JSON` - JSON data - `RECORD` or `STRUCT` - Nested structure +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + ### Debug Options |Field|Required|Default|Description| |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| -|`debug.dont_send_logs`|N|`false`|Process logs but don't send to BigQuery (testing)| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Details diff --git a/docs/configuration/targets/clickhouse.mdx b/docs/configuration/targets/clickhouse.mdx index 1032dc08..c4c6579e 100644 --- a/docs/configuration/targets/clickhouse.mdx +++ b/docs/configuration/targets/clickhouse.mdx @@ -23,6 +23,11 @@ Creates a ClickHouse target that sends log data to a ClickHouse database server table: batch_size: field_format: + interval: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -55,6 +60,20 @@ The following fields are used to define the target: |`batch_size`|N|-|Number of log entries to batch before sending| |`field_format`|N|-|Data normalization format. See applicable Normalization section| +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Details The ClickHouse target uses the native ClickHouse protocol to efficiently send log data in batches. 
Logs are accumulated until the batch size is reached, then sent to the server. The default batch size is defined by the service configuration, but can be overridden. diff --git a/docs/configuration/targets/console.mdx b/docs/configuration/targets/console.mdx index a859ed88..37bd2c44 100644 --- a/docs/configuration/targets/console.mdx +++ b/docs/configuration/targets/console.mdx @@ -16,6 +16,11 @@ Creates a console target that writes log messages to the standard output. Suppor pipelines: properties: field_format: + interval: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -31,26 +36,19 @@ The following are the fields used to define the target: |`status`|N|`true`|Enable/disable the target| |`field_format`|N|-|Data normalization format. See applicable Normalization section| -Field format standards: +### Scheduler - - `ecs` - Elastic Common Schema - - `cim` - Common Information Model +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| - `asim` - Advanced Security Information Model - +### Debug Options -:::note -If no field_format is specified, log messages will be written to the console without any field normalization. -::: - -:::warning -Using field normalization may impact the performance with high message volumes. -::: +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Examples diff --git a/docs/configuration/targets/elasticsearch.mdx b/docs/configuration/targets/elasticsearch.mdx index 39f3e81a..e8a4859d 100644 --- a/docs/configuration/targets/elasticsearch.mdx +++ b/docs/configuration/targets/elasticsearch.mdx @@ -30,7 +30,10 @@ Creates an Elasticsearch target that sends data using the Bulk API. Supports mul username: password: interval: - schedule: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -59,8 +62,6 @@ The following are the fields used to define the target: |`filter_path`|N|`errors,items.*.error,items.*._index,items.*.status`|Response filter path| |`pipeline`|N|-|Ingest pipeline name| |`field_format`|N|-|Data normalization format. See applicable Normalization section| -|`frequency`|N|realtime|Execution frequency. See Frequency for details| -|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| ### Endpoint @@ -70,6 +71,20 @@ The following are the fields used to define the target: |`username`|N|-|Basic auth username| |`password`|N|-|Basic auth password| +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Details The target supports multiple endpoints, authentication, compression, and ingest pipelines. Data is batched for efficient delivery and can be automatically routed to different indices. 
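As a point of reference for the paragraph above, here is a minimal sketch of a target that fans out across two endpoints with basic authentication and a server-side ingest pipeline. The URLs, credentials, and pipeline name are placeholders rather than values taken from this documentation:

```yaml
targets:
  - name: multi_endpoint_elastic
    type: elastic
    properties:
      index: "app-logs"
      pipeline: "add-geoip"              # ingest pipeline applied on the Elasticsearch side
      endpoints:
        - endpoint: "http://elasticsearch-1:9200"
          username: "ingest"
          password: "${ELASTIC_PASSWORD}"
        - endpoint: "http://elasticsearch-2:9200"
          username: "ingest"
          password: "${ELASTIC_PASSWORD}"
```

Batching and compression behave as described in the sections that follow; only the endpoint list and the pipeline differ from a single-node setup.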
@@ -339,8 +354,8 @@ Events are batched until either limit is reached: - **`max_payload_size_kb`**: Total size in kilobytes Tune these based on your average event size: -- **Small events (less than 1KB)**: Increase `batch_size`, keep default `max_payload_size_kb` -- **Large events (greater than 10KB)**: Keep default `batch_size`, increase `max_payload_size_kb` +- **Small events (<1KB>)**: Increase `batch_size`, keep default `max_payload_size_kb` +- **Large events (>10KB)**: Keep default `batch_size`, increase `max_payload_size_kb` - **Mixed sizes**: Monitor both limits and adjust based on actual batch sizes ### Timeout diff --git a/docs/configuration/targets/event-hubs.mdx b/docs/configuration/targets/event-hubs.mdx index 99edef4d..973786ad 100644 --- a/docs/configuration/targets/event-hubs.mdx +++ b/docs/configuration/targets/event-hubs.mdx @@ -32,6 +32,11 @@ Creates a target that sends processed messages to _Azure Event Hubs_ with suppor status: cert_name: key_name: + interval: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -100,6 +105,20 @@ EventHubs target supports two authentication methods: TLS certificate and key files must be placed in the service root directory. ::: +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Details The EventHubs target sends processed messages to Azure Event Hubs for real-time event streaming and analytics. It supports automatic batching for optimal performance, configurable retry mechanisms for reliability, and multiple authentication methods for flexible deployment scenarios. diff --git a/docs/configuration/targets/file.mdx b/docs/configuration/targets/file.mdx index f1bba924..29f0fdd8 100644 --- a/docs/configuration/targets/file.mdx +++ b/docs/configuration/targets/file.mdx @@ -26,6 +26,11 @@ Creates a file target that writes log messages to files in various formats like batch_size: max_size: locations: + interval: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -77,6 +82,20 @@ targets: format: "json" ``` +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Details The file target supports writing to multiple file locations with different formats and schemas. When using `SystemS3` field in your logs, the value will be used to route the message to the location with a matching ID. 
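To make the `SystemS3` routing above concrete, here is a rough sketch with two locations. The per-location field names used here (`id`, `location`, `name`, `format`) are assumptions for illustration, not a definitive schema; the idea is that an event whose `SystemS3` value matches a location's identifier is written to that location:

```yaml
targets:
  - name: routed_files
    type: file
    properties:
      locations:
        # Field names below are illustrative; align them with the actual location schema.
        - id: "audit"                    # compared against the event's SystemS3 value
          location: "/var/log/datastream/audit"
          name: "audit-events.json"
          format: "json"
        - id: "app"
          location: "/var/log/datastream/app"
          name: "app-events.json"
          format: "json"
```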
diff --git a/docs/configuration/targets/microsoft-sentinel-data-lake.mdx b/docs/configuration/targets/microsoft-sentinel-data-lake.mdx index 2009afc7..5fa1d6a5 100644 --- a/docs/configuration/targets/microsoft-sentinel-data-lake.mdx +++ b/docs/configuration/targets/microsoft-sentinel-data-lake.mdx @@ -37,6 +37,8 @@ For more details on Microsoft Sentinel integration, refer to Normalization section| +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + ### Debug Options |Field|Required|Default|Description| |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| -|`debug.dont_send_logs`|N|`false`|Process logs but don't send to Sentinel (testing)| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Details diff --git a/docs/configuration/targets/microsoft-sentinel.mdx b/docs/configuration/targets/microsoft-sentinel.mdx index fe2eaffa..6f6d44de 100644 --- a/docs/configuration/targets/microsoft-sentinel.mdx +++ b/docs/configuration/targets/microsoft-sentinel.mdx @@ -37,6 +37,8 @@ For more details on Microsoft Sentinel integration, refer to Normalization section| +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + ### Debug Options |Field|Required|Default|Description| |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| -|`debug.dont_send_logs`|N|`false`|Process logs but don't send to Sentinel (testing)| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| ## Automatic Table Selection diff --git a/docs/configuration/targets/overview.mdx b/docs/configuration/targets/overview.mdx index 46ed5ac8..e74acab4 100644 --- a/docs/configuration/targets/overview.mdx +++ b/docs/configuration/targets/overview.mdx @@ -62,6 +62,184 @@ The target listed here is of type `elasticsearch`. It specifies the _host_ that You can use environment variables like `${PASSWORD}` for your credentials. This will improve security by removing the credentials from your configuration file. ::: +## Debug Options + +Targets support debug configuration options for testing, troubleshooting, and development purposes. These options allow you to inspect data flow without affecting production systems. + +### Configuration + +Debug options are configured under the `debug` property within target properties: + +```yaml +targets: + - name: test_elastic + type: elastic + properties: + index: "test-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + debug: + status: true + dont_send_logs: false +``` + +### Debug Fields + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging for the target| +|`debug.dont_send_logs`|N|`false`|Prevent logs from being sent to the actual target| + +### Debug Status + +When `debug.status` is set to `true`, the target logs each event to the internal debugger before processing. 
This provides visibility into: + +- Message content being sent +- Device information (ID, name, type) +- Target type and operation details +- Timing and sequence of events + +Debug logs are written to the system's debug output and can be used to: + +- Verify data transformation and formatting +- Troubleshoot pipeline processing issues +- Monitor data flow in development environments +- Audit message content during testing + +### Don't Send Logs + +When `debug.dont_send_logs` is set to `true`, events are logged to the debugger but **not** sent to the actual target destination. This is useful for: + +- **Safe Testing**: Test configuration changes without affecting production systems +- **Development**: Develop and validate pipelines without external dependencies +- **Cost Control**: Avoid charges from cloud services during testing +- **Dry Runs**: Verify event formatting and routing logic before deployment + +:::warning +The `dont_send_logs` option only works when `debug.status` is also set to `true`. If debugging is disabled, logs will be sent normally regardless of the `dont_send_logs` setting. +::: + +### Use Cases + +#### Development Environment + +Test your configuration safely without sending data to production targets: + +```yaml +targets: + - name: dev_splunk + type: splunk + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + index: "main" + debug: + status: true + dont_send_logs: true +``` + +#### Troubleshooting + +Enable debug logging to diagnose issues while still sending data: + +```yaml +targets: + - name: debug_elastic + type: elastic + properties: + index: "production-logs" + endpoints: + - endpoint: "http://elasticsearch:9200" + debug: + status: true + dont_send_logs: false +``` + +#### Pipeline Validation + +Verify pipeline transformations before enabling the target: + +```yaml +targets: + - name: validate_transformations + type: splunk + properties: + endpoints: + - endpoint: "https://splunk.example.com:8088/services/collector" + token: "YOUR-TOKEN" + field_format: "cim" + debug: + status: true + dont_send_logs: true + pipelines: + - name: test_pipeline + processors: + - set: + field: environment + value: "development" +``` + +#### Staged Deployment + +Test new target configurations in parallel with existing ones: + +```yaml +targets: + # Production target (normal operation) + - name: prod_elastic + type: elastic + properties: + index: "production-logs" + endpoints: + - endpoint: "http://prod-elasticsearch:9200" + + # Test target (debug mode, no actual sending) + - name: test_elastic + type: elastic + properties: + index: "test-logs" + endpoints: + - endpoint: "http://test-elasticsearch:9200" + debug: + status: true + dont_send_logs: true +``` + +### Best Practices + +**Disable in Production**: Always disable debug options in production environments to avoid performance overhead and excessive logging. + +**Use for Development**: Enable `dont_send_logs` during development to prevent test data from reaching production systems. + +**Temporary Troubleshooting**: Enable debug logging temporarily when investigating issues, then disable it once resolved. + +**Separate Configurations**: Maintain separate configuration files for development and production environments with appropriate debug settings. + +**Monitor Debug Output**: Ensure your logging system can handle the increased volume when debug logging is enabled. 
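As a concrete illustration of the "Separate Configurations" practice above, the same target can be declared in two files that differ only in their debug block; the index and endpoint values here are placeholders.

```yaml
# targets-dev.yml - development: inspect events without delivering them
targets:
  - name: app_logs
    type: elastic
    properties:
      index: "app-logs"
      endpoints:
        - endpoint: "http://elasticsearch:9200"
      debug:
        status: true
        dont_send_logs: true
```

```yaml
# targets-prod.yml - production: debug disabled to avoid logging overhead
targets:
  - name: app_logs
    type: elastic
    properties:
      index: "app-logs"
      endpoints:
        - endpoint: "http://elasticsearch:9200"
```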
+ +### Performance Considerations + +Debug logging adds overhead to target processing: + +- Each event is serialized and written to the debug log +- Additional function calls and memory allocation occur +- Log I/O operations may impact throughput + +For high-volume scenarios: +- Disable debug logging in production +- Use debug mode only for representative samples +- Monitor system resources when debugging is enabled + +### Security Notes + +Debug logs may contain sensitive information: + +- Message content is logged verbatim +- Authentication tokens are not logged, but message content might contain PII +- Ensure debug logs are secured with appropriate access controls +- Review debug output before sharing for troubleshooting + ## Deployment The following deployment types can be used: @@ -158,4 +336,4 @@ Configure intelligent routing based on data characteristics: * Route high-volume data to scalable storage targets * Send critical alerts to real-time notification endpoints -* Direct compliance data to specialized archival systems +* Direct compliance data to specialized archival systems \ No newline at end of file diff --git a/docs/configuration/targets/splunk-hec.mdx b/docs/configuration/targets/splunk-hec.mdx index 36be8158..5bedec25 100644 --- a/docs/configuration/targets/splunk-hec.mdx +++ b/docs/configuration/targets/splunk-hec.mdx @@ -21,7 +21,7 @@ Creates a Splunk HTTP Event Collector (HEC) target that sends events to one or m token: secret: index: - source_type: + sourcetype: source: batch_size: timeout: @@ -30,7 +30,10 @@ Creates a Splunk HTTP Event Collector (HEC) target that sends events to one or m insecure_skip_verify: field_format: interval: - schedule: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -59,7 +62,7 @@ The following are the fields used to define the target: |Field|Required|Default|Description| |---|---|---|---| |`index`|N|-|Default Splunk index| -|`source_type`|N|-|Default sourcetype for events| +|`sourcetype`|N|-|Default sourcetype for events| |`source`|N|-|Default source for events| |`batch_size`|N|10000|Number of events to batch before sending| |`timeout`|N|30|Connection timeout in seconds| @@ -72,9 +75,21 @@ The following are the fields used to define the target: |`use_compression`|N|`true`|Enable gzip compression| |`insecure_skip_verify`|N|`false`|Skip TLS certificate verification| |`field_format`|N|-|Data normalization format. See applicable Normalization section| -|`frequency`|N|realtime|Execution frequency. See Frequency for details| + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| |`cron`|N|-|Cron expression for scheduled execution. See Cron for details| +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Details The Splunk HEC target sends log data to Splunk using the HTTP Event Collector (HEC) protocol. It supports multiple authentication methods, batching, compression, and automatic load balancing between endpoints. @@ -160,7 +175,7 @@ Normalization is applied before batching and sending to Splunk. auth_type: token token: "YOUR-HEC-TOKEN" index: "main" - source_type: "vmetric" + sourcetype: "vmetric" ``` @@ -188,7 +203,7 @@ Normalization is applied before batching and sending to Splunk. 
auth_type: token token: "TERTIARY-TOKEN" index: "main" - source_type: "vmetric" + sourcetype: "vmetric" batch_size: 5000 ``` @@ -211,7 +226,7 @@ Normalization is applied before batching and sending to Splunk. auth_type: token token: "YOUR-HEC-TOKEN" index: "metrics" - source_type: "vmetric" + sourcetype: "vmetric" batch_size: 20000 timeout: 60 use_compression: true @@ -237,7 +252,7 @@ Normalization is applied before batching and sending to Splunk. auth_type: token token: "YOUR-HEC-TOKEN" index: "main" - source_type: "normalized_logs" + sourcetype: "normalized_logs" field_format: "cim" ``` @@ -260,7 +275,7 @@ Normalization is applied before batching and sending to Splunk. auth_type: secret secret: "YOUR-BEARER-TOKEN" index: "secure" - source_type: "vmetric" + sourcetype: "vmetric" source: "production_cluster" insecure_skip_verify: false use_compression: true @@ -285,7 +300,7 @@ Normalization is applied before batching and sending to Splunk. auth_type: token token: "YOUR-HEC-TOKEN" index: "main" - source_type: "vmetric" + sourcetype: "vmetric" use_compression: false ``` diff --git a/docs/configuration/targets/syslog.mdx b/docs/configuration/targets/syslog.mdx index 6db182c9..01edc2d2 100644 --- a/docs/configuration/targets/syslog.mdx +++ b/docs/configuration/targets/syslog.mdx @@ -39,6 +39,11 @@ For details of the format, see **Appendix**. verify: cert_name: key_name: + interval: + cron: + debug: + status: + dont_send_logs: ``` ## Configuration @@ -102,6 +107,20 @@ The following variables can be used in the message template: Invalid templates will fall back to sending the raw message content without formatting. ::: +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + ## Field Normalization Field normalization helps standardize log data before sending it to the syslog server, ensuring consistent data formats: From 34d6eeea2378ad0907d09ff6a09a45481812d4eb Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 11:11:08 +0100 Subject: [PATCH 08/13] S3 and AWS Security Lake --- docs/configuration/targets/aws-s3.mdx | 3 - .../targets/aws-security-lake.mdx | 420 ++++++++++++++++++ sidebars.ts | 1 + 3 files changed, 421 insertions(+), 3 deletions(-) create mode 100644 docs/configuration/targets/aws-security-lake.mdx diff --git a/docs/configuration/targets/aws-s3.mdx b/docs/configuration/targets/aws-s3.mdx index 4d0ed843..4fa7cd32 100644 --- a/docs/configuration/targets/aws-s3.mdx +++ b/docs/configuration/targets/aws-s3.mdx @@ -43,7 +43,6 @@ Creates a target that writes log messages to Amazon S3 buckets with support for debug: status: dont_send_logs: - drop_events: ``` ## Configuration @@ -117,7 +116,6 @@ When `max_size` is reached, the current file is uploaded to S3 and a new file is |---|---|---|---| |`debug.status`|N|`false`|Enable debug logging| |`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| -|`debug.drop_events`|N|`false`|Drop unknown stream events instead of failing| ## Details @@ -297,5 +295,4 @@ targets: debug: status: true dont_send_logs: true - drop_events: true ``` \ No newline at end of file diff --git a/docs/configuration/targets/aws-security-lake.mdx b/docs/configuration/targets/aws-security-lake.mdx new file mode 100644 index 00000000..38f018ad --- /dev/null +++ b/docs/configuration/targets/aws-security-lake.mdx @@ -0,0 +1,420 @@ +# AWS Security Lake + +Amazon AWSSecurity + +## Synopsis + +Creates a target that writes security and compliance log data to AWS Security Lake, Amazon's purpose-built data lake for security data. The target automatically formats data in OCSF (Open Cybersecurity Schema Framework) compliant Parquet format, enabling centralized collection and analysis of security logs from various sources. AWS Security Lake requires data to be in Parquet format with OCSF schema compliance for proper ingestion and processing. + +## Schema + +```yaml +- name: + description: + type: awssecuritylake + pipelines: + status: + properties: + key: + secret: + session: + region: + endpoint: + part_size: + source: + account: + buckets: + - bucket: + name: + schema: + max_size: + batch_size: + timeout: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `awssecuritylake`| +|`pipelines`|Y||Must include `aws_lake` pipeline for OCSF normalization| +|`status`|N|`true`|Enable/disable the target| + +### AWS Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|N*|-|AWS access key ID for authentication| +|`secret`|N*|-|AWS secret access key for authentication| +|`session`|N|-|Optional session token for temporary credentials| +|`region`|Y|-|AWS region where Security Lake is configured (e.g., `us-east-1`, `eu-west-1`)| +|`endpoint`|N|-|Custom endpoint URL (for testing or alternate configurations)| + +\* = Conditionally required. 
AWS credentials (`key` and `secret`) are required unless using IAM role-based authentication on AWS infrastructure. + +### Security Lake Configuration + +|Field|Required|Default|Description| +|---|---|---|---| +|`source`|Y|-|Custom source name identifying the data source in Security Lake| +|`account`|Y|-|AWS account ID where Security Lake is configured| +|`region`|Y|-|AWS region for Security Lake (must match the region in AWS Credentials)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| + +### Buckets + +|Field|Required|Default|Description| +|---|---|---|---| +|`buckets`|Y|-|Array of bucket configurations for different security data types| +|`buckets.bucket`|Y|-|Security Lake bucket name for specific data type| +|`buckets.name`|Y|-|File name template| +|`buckets.schema`|Y|-|OCSF schema identifier (e.g., `OCSF4001`, `OCSF3002`)| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +:::note +AWS Security Lake requires all data to be in Parquet format with OCSF schema compliance. The target automatically uses Parquet format and gzip compression as required by AWS Security Lake specifications. Each bucket must specify its corresponding OCSF schema identifier to ensure proper data routing and classification. +::: + +:::note +When `max_size` is reached, the current file is uploaded to Security Lake and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The AWS Security Lake target integrates with Amazon Security Lake, providing a centralized security data lake for collecting, normalizing, and analyzing security logs across your AWS environment and third-party sources. This target automatically organizes data using Security Lake's partitioning structure and ensures all data is formatted in OCSF-compliant Parquet format as required by AWS Security Lake. + +### Authentication Methods + +Supports static credentials (access key and secret key) with optional session tokens for temporary credentials. When deployed on AWS infrastructure, can leverage IAM role-based authentication without explicit credentials. The IAM role or user must have appropriate permissions to write to Security Lake buckets. + +### Security Lake Integration + +AWS Security Lake provides a centralized repository for security data, automatically normalizing logs into the Open Cybersecurity Schema Framework (OCSF) format. The target handles the required partitioning structure (`ext/{source}/region={region}/accountId={accountId}/eventDay={YYYYMMDD}/`) automatically, ensuring data is properly organized for Security Lake ingestion and analysis. + +All data is automatically written in Parquet format with gzip compression, which is the only format accepted by AWS Security Lake. The target ensures schema compliance with OCSF specifications through the specified schema identifier for each bucket. 
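For example, with the sketch below (credentials and account values are the same placeholders used elsewhere on this page), an object written on 15 January 2024 would land under a prefix derived from the partition template above, approximately `ext/virtualmetric/region=us-east-1/accountId=123456789012/eventDay=20240115/`.

```yaml
targets:
  - name: partition_path_example
    type: awssecuritylake
    pipelines:
      - aws_lake
    properties:
      key: "AKIAIOSFODNN7EXAMPLE"
      secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
      region: "us-east-1"        # fills region={region} in the partition path
      source: "virtualmetric"    # fills ext/{source}
      account: "123456789012"    # fills accountId={accountId}
      buckets:
        - bucket: "aws-security-data-lake-network"
          name: "network-{{.Timestamp}}.parquet"
          schema: "OCSF4001"
```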
+ +### Bucket Configuration Requirements + +AWS Security Lake target requires explicit bucket configuration with corresponding OCSF schema identifiers. Each bucket represents a specific security event type and must be configured with: + +- **Bucket name**: The Security Lake bucket for the specific event type +- **File name template**: Pattern for generated Parquet files +- **OCSF schema identifier**: The schema class that matches the event type + +This approach ensures proper data routing and classification, as the target needs to know which bucket corresponds to which OCSF schema to correctly process and store security events. + +### VirtualMetric AWS Security Lake Pack + +VirtualMetric provides the **Amazon Security Lake Automation and Normalization Pack**, which offers enterprise-grade normalization and routing for AWS Security Lake. This automation pack transforms diverse security data sources into OCSF-compliant format, including: + +- **Syslog messages** (native, CEF, LEEF formats) +- **Windows Security Events** and Windows Firewall logs +- **Firewall logs** from major vendors (Fortinet, Palo Alto Networks, Check Point, Cisco ASA, SonicWall, WatchGuard, Cisco Meraki) +- **Windows DNS logs** + +The pack implements a sophisticated multi-stage processing pipeline with intelligent source detection, vendor-specific optimization, and automatic OCSF schema compliance. It supports all OCSF schema classes and handles the complete transformation from source format to OCSF without requiring manual schema configuration in the pipeline. + +When using the VirtualMetric AWS Security Lake Pack (pipeline: `aws_lake`), data is automatically normalized to OCSF format before reaching the target. The pack handles all intermediate transformations, including CEF/LEEF to CommonSecurityLog, ECS to ASIM, and ASIM to OCSF conversions. You only need to define the buckets with their corresponding OCSF schema identifiers in the target configuration. + +### OCSF Schema Identifiers + +AWS Security Lake uses OCSF schema classes to categorize security events. 
Common schema identifiers include: + +|Schema ID|Description|Event Types| +|---|---|---| +|`OCSF1001`|File Activity|File access, creation, deletion, modification| +|`OCSF1002`|Kernel Extension Activity|Kernel module operations| +|`OCSF1003`|Kernel Activity|System calls, kernel events| +|`OCSF1004`|Memory Activity|Memory allocation, access patterns| +|`OCSF1005`|Module Activity|Library loading, dynamic linking| +|`OCSF1006`|Scheduled Job Activity|Cron jobs, task scheduler| +|`OCSF1007`|Process Activity|Process creation, termination| +|`OCSF2001`|Security Finding|Vulnerability findings, security issues| +|`OCSF3001`|Account Change|User account modifications| +|`OCSF3002`|Authentication|Login, logout, authentication events| +|`OCSF3003`|Authorize Session|Session authorization, access control| +|`OCSF3004`|Entity Management|Identity and entity operations| +|`OCSF3005`|User Access Management|Permission changes, role assignments| +|`OCSF4001`|Network Activity|Network connections, traffic flows| +|`OCSF4002`|HTTP Activity|Web requests, API calls| +|`OCSF4003`|DNS Activity|DNS queries and responses| +|`OCSF4004`|DHCP Activity|DHCP lease operations| +|`OCSF4005`|RDP Activity|Remote desktop connections| +|`OCSF4006`|SMB Activity|File sharing, SMB sessions| +|`OCSF4007`|SSH Activity|SSH connections and commands| +|`OCSF4008`|FTP Activity|File transfer operations| +|`OCSF4009`|Email Activity|Email sending, receiving| +|`OCSF4010`|Network File Activity|Network file operations| +|`OCSF4011`|Email File Activity|Email attachment handling| +|`OCSF4012`|Email URL Activity|Links in emails| +|`OCSF5001`|Inventory Info|Asset inventory updates| +|`OCSF5002`|Config State|Configuration changes| +|`OCSF6001`|Web Resources Activity|Web resource access| +|`OCSF6002`|Application Lifecycle|App deployment, updates| +|`OCSF6003`|API Activity|API endpoint usage| +|`OCSF6004`|Web Resource Access Activity|Web content access| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Files are automatically uploaded to the correct Security Lake partition path based on the current date and configured source, region, and account parameters. + +All files are written in Parquet format with gzip compression as required by AWS Security Lake. The OCSF schema specified for each bucket determines the structure and field types within the Parquet files. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.TargetName}}`|Target name|`security_logs`| +|`{{.TargetType}}`|Target type|`awssecuritylake`| +|`{{.Table}}`|Bucket name|`security-data`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage for security data workloads. + +### Multiple Buckets + +The target requires multiple bucket configurations for different security data types (e.g., network logs, authentication logs, DNS queries, process events), enabling organized data classification and access control. Each bucket configuration must specify its corresponding OCSF schema identifier to ensure proper data routing. 
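The examples that follow all run in realtime; when event volumes would otherwise produce many small Parquet objects, the scheduler fields documented above can batch uploads instead. A minimal sketch, assuming a one-hour interval is acceptable for your ingestion latency:

```yaml
targets:
  - name: hourly_security_lake
    type: awssecuritylake
    pipelines:
      - aws_lake
    properties:
      key: "AKIAIOSFODNN7EXAMPLE"
      secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
      region: "us-east-1"
      source: "virtualmetric"
      account: "123456789012"
      interval: "1h"   # accumulate events and upload once per hour
      buckets:
        - bucket: "aws-security-data-lake-network"
          name: "network-{{.Timestamp}}.parquet"
          schema: "OCSF4001"
```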
+ +## Examples + +### Basic Multi-Bucket Configuration + +Configuration for multiple security event types: + +```yaml +targets: + - name: security_lake_multi + type: awssecuritylake + pipelines: + - aws_lake + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-network" + name: "network-events-{{.Timestamp}}.parquet" + schema: "OCSF4001" + - bucket: "aws-security-data-lake-auth" + name: "auth-events-{{.Timestamp}}.parquet" + schema: "OCSF3002" + - bucket: "aws-security-data-lake-dns" + name: "dns-events-{{.Timestamp}}.parquet" + schema: "OCSF4003" +``` + +### Comprehensive Multi-Source Configuration + +Configuration for collecting multiple security data types with the VirtualMetric pack: + +```yaml +targets: + - name: security_lake_comprehensive + type: awssecuritylake + pipelines: + - aws_lake + - checkpoint + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric-enterprise" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-network" + name: "network-{{.Year}}{{.Month}}{{.Day}}-{{.Timestamp}}.parquet" + schema: "OCSF4001" + - bucket: "aws-security-data-lake-authentication" + name: "auth-{{.Year}}{{.Month}}{{.Day}}-{{.Timestamp}}.parquet" + schema: "OCSF3002" + - bucket: "aws-security-data-lake-dns" + name: "dns-{{.Year}}{{.Month}}{{.Day}}-{{.Timestamp}}.parquet" + schema: "OCSF4003" + - bucket: "aws-security-data-lake-process" + name: "process-{{.Year}}{{.Month}}{{.Day}}-{{.Timestamp}}.parquet" + schema: "OCSF1007" + timeout: 90 + part_size: 10 +``` + +### High Reliability Configuration + +Configuration with enhanced reliability settings for critical security data: + +```yaml +targets: + - name: critical_security_logs + type: awssecuritylake + pipelines: + - aws_lake + - checkpoint + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric-critical" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-findings" + name: "findings-{{.Timestamp}}.parquet" + schema: "OCSF2001" + - bucket: "aws-security-data-lake-network" + name: "network-{{.Timestamp}}.parquet" + schema: "OCSF4001" + timeout: 60 + part_size: 10 + batch_size: 50000 +``` + +### Windows Security Events + +Configuration for Windows-specific security logs: + +```yaml +targets: + - name: security_lake_windows + type: awssecuritylake + pipelines: + - aws_lake + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-west-2" + source: "virtualmetric-windows" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-authentication" + name: "windows-auth-{{.Timestamp}}.parquet" + schema: "OCSF3002" + - bucket: "aws-security-data-lake-process" + name: "windows-process-{{.Timestamp}}.parquet" + schema: "OCSF1007" + - bucket: "aws-security-data-lake-account" + name: "windows-account-{{.Timestamp}}.parquet" + schema: "OCSF3001" + max_size: 268435456 +``` + +### Network and HTTP Activity + +Configuration for network traffic and web activity monitoring: + +```yaml +targets: + - name: security_lake_network_http + type: awssecuritylake + pipelines: + - aws_lake + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "eu-west-1" + source: 
"virtualmetric-network" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-network" + name: "network-traffic-{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.parquet" + schema: "OCSF4001" + - bucket: "aws-security-data-lake-http" + name: "http-activity-{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.parquet" + schema: "OCSF4002" + - bucket: "aws-security-data-lake-dns" + name: "dns-queries-{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.parquet" + schema: "OCSF4003" +``` + +### Firewall Logs + +Configuration for firewall and network security device logs: + +```yaml +targets: + - name: security_lake_firewall + type: awssecuritylake + pipelines: + - aws_lake + - checkpoint + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric-firewall" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-network" + name: "firewall-{{.Timestamp}}.parquet" + schema: "OCSF4001" + - bucket: "aws-security-data-lake-findings" + name: "firewall-threats-{{.Timestamp}}.parquet" + schema: "OCSF2001" + timeout: 90 + max_size: 536870912 +``` + +### Debug Configuration + +Configuration with debugging enabled for testing: + +```yaml +targets: + - name: debug_security_lake + type: awssecuritylake + pipelines: + - aws_lake + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric-test" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-test" + name: "test-{{.Timestamp}}.parquet" + schema: "OCSF4001" + debug: + status: true + dont_send_logs: true +``` + +:::note +All configurations must include the `aws_lake` pipeline for automatic OCSF normalization and must define explicit bucket configurations with corresponding OCSF schema identifiers. The VirtualMetric AWS Security Lake Pack handles all data transformation automatically, routing events to the appropriate buckets based on their OCSF schema class. 
+::: \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 06723bea..b469246d 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -116,6 +116,7 @@ const sidebars: SidebarsConfig = { items: [ "configuration/targets/overview", "configuration/targets/aws-s3", + "configuration/targets/aws-security-lake", "configuration/targets/azure-blob-storage", "configuration/targets/azure-data-explorer", "configuration/targets/bigquery", From 2a09abb8e1545729b01e7df7f9243d2f10b50410 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 11:12:56 +0100 Subject: [PATCH 09/13] Fix tags --- docs/configuration/targets/aws-s3.mdx | 2 +- docs/configuration/targets/aws-security-lake.mdx | 2 +- docs/configuration/targets/microsoft-sentinel-data-lake.mdx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/configuration/targets/aws-s3.mdx b/docs/configuration/targets/aws-s3.mdx index 4fa7cd32..65b437e9 100644 --- a/docs/configuration/targets/aws-s3.mdx +++ b/docs/configuration/targets/aws-s3.mdx @@ -1,6 +1,6 @@ # AWS S3 -Amazon AWSCloud Storage +Amazon AWSLong Term Storage ## Synopsis diff --git a/docs/configuration/targets/aws-security-lake.mdx b/docs/configuration/targets/aws-security-lake.mdx index 38f018ad..5d9b23f7 100644 --- a/docs/configuration/targets/aws-security-lake.mdx +++ b/docs/configuration/targets/aws-security-lake.mdx @@ -1,6 +1,6 @@ # AWS Security Lake -Amazon AWSSecurity +Amazon AWSSecurity Lake ## Synopsis diff --git a/docs/configuration/targets/microsoft-sentinel-data-lake.mdx b/docs/configuration/targets/microsoft-sentinel-data-lake.mdx index 5fa1d6a5..7a1a2c5e 100644 --- a/docs/configuration/targets/microsoft-sentinel-data-lake.mdx +++ b/docs/configuration/targets/microsoft-sentinel-data-lake.mdx @@ -1,6 +1,6 @@ # Microsoft Sentinel data lake -Microsoft AzureSIEM +Microsoft AzureSecurity Lake ## Synopsis From 8e2dd3347e7a5f89cd02a2e220a4f5b55fc78ffc Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 11:23:58 +0100 Subject: [PATCH 10/13] Add new storage targets --- docs/configuration/targets/alibaba-oss.mdx | 354 ++++++++++++++++ docs/configuration/targets/backblaze-b2.mdx | 344 ++++++++++++++++ docs/configuration/targets/cloudflare-r2.mdx | 308 ++++++++++++++ .../targets/digitalocean-spaces.mdx | 325 +++++++++++++++ docs/configuration/targets/ibm-cos.mdx | 321 +++++++++++++++ docs/configuration/targets/minio.mdx | 324 +++++++++++++++ .../configuration/targets/oracle-cloud-os.mdx | 342 ++++++++++++++++ docs/configuration/targets/scaleway-os.mdx | 346 ++++++++++++++++ .../targets/wasabi-cloud-storage.mdx | 382 ++++++++++++++++++ sidebars.ts | 9 + 10 files changed, 3055 insertions(+) create mode 100644 docs/configuration/targets/alibaba-oss.mdx create mode 100644 docs/configuration/targets/backblaze-b2.mdx create mode 100644 docs/configuration/targets/cloudflare-r2.mdx create mode 100644 docs/configuration/targets/digitalocean-spaces.mdx create mode 100644 docs/configuration/targets/ibm-cos.mdx create mode 100644 docs/configuration/targets/minio.mdx create mode 100644 docs/configuration/targets/oracle-cloud-os.mdx create mode 100644 docs/configuration/targets/scaleway-os.mdx create mode 100644 docs/configuration/targets/wasabi-cloud-storage.mdx diff --git a/docs/configuration/targets/alibaba-oss.mdx b/docs/configuration/targets/alibaba-oss.mdx new file mode 100644 index 00000000..984252bf --- /dev/null +++ b/docs/configuration/targets/alibaba-oss.mdx @@ -0,0 +1,354 @@ +# Alibaba Cloud OSS + +Alibaba CloudLong Term Storage + +## 
Synopsis + +Creates a target that writes log messages to Alibaba Cloud Object Storage Service (OSS) with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. Alibaba Cloud OSS provides secure, cost-effective, and high-durability object storage with strong presence in Asia-Pacific regions. + +## Schema + +```yaml {1,3} +- name: + description: + type: alibabas3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `alibabas3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Alibaba Cloud OSS Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Alibaba Cloud OSS access key ID| +|`secret`|Y|-|Alibaba Cloud OSS access key secret| +|`region`|Y|-|Alibaba Cloud region (e.g., `oss-cn-hangzhou`, `oss-us-west-1`, `oss-ap-southeast-1`)| +|`endpoint`|Y|-|OSS endpoint URL (format: `https://oss-.aliyuncs.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default OSS bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|OSS bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to OSS and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Alibaba Cloud OSS target provides enterprise-grade cloud storage integration with comprehensive file format support. OSS offers 99.9999999999% (12 nines) data durability and strong regional coverage across Asia-Pacific, making it ideal for applications serving Asian markets. + +### Authentication + +Requires Alibaba Cloud access credentials. Access keys can be created through the Alibaba Cloud Console under AccessKey Management. RAM (Resource Access Management) users can be created with specific OSS permissions for enhanced security. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://oss-.aliyuncs.com` where `` is your chosen Alibaba Cloud region identifier. Internal endpoints are also available for ECS instances in the same region using `https://oss--internal.aliyuncs.com` for cost savings. + +### Available Regions + +Alibaba Cloud OSS is available in numerous regions worldwide: + +|Region Code|Location| +|---|---| +|`oss-cn-hangzhou`|China (Hangzhou)| +|`oss-cn-shanghai`|China (Shanghai)| +|`oss-cn-beijing`|China (Beijing)| +|`oss-cn-shenzhen`|China (Shenzhen)| +|`oss-cn-hongkong`|China (Hong Kong)| +|`oss-us-west-1`|US (Silicon Valley)| +|`oss-us-east-1`|US (Virginia)| +|`oss-ap-southeast-1`|Singapore| +|`oss-ap-southeast-2`|Australia (Sydney)| +|`oss-ap-southeast-3`|Malaysia (Kuala Lumpur)| +|`oss-ap-southeast-5`|Indonesia (Jakarta)| +|`oss-ap-northeast-1`|Japan (Tokyo)| +|`oss-ap-south-1`|India (Mumbai)| +|`oss-eu-central-1`|Germany (Frankfurt)| +|`oss-eu-west-1`|UK (London)| +|`oss-me-east-1`|UAE (Dubai)| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`alibabas3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. 
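If rotated files regularly reach hundreds of megabytes, raising `part_size` reduces the number of parts per upload at the cost of more memory. A sketch with assumed values (the credentials and bucket name are placeholders):

```yaml
targets:
  - name: large_file_oss
    type: alibabas3
    properties:
      key: "LTAI5tAbCdEfGhIjKlMnOpQr"
      secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789"
      region: "oss-cn-hangzhou"
      endpoint: "https://oss-cn-hangzhou.aliyuncs.com"
      bucket: "large-archives"
      name: "archive-{{.Timestamp}}.json"
      format: "json"
      part_size: 16          # 16 MB parts instead of the 5 MB default
      timeout: 120           # allow more time per request on slower links
      max_size: 1073741824   # rotate at roughly 1 GB
```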
+ +### Multiple Buckets + +Single target can write to multiple OSS buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Storage Classes + +Alibaba Cloud OSS supports multiple storage classes including Standard, Infrequent Access, Archive, and Cold Archive for cost optimization based on access patterns. + +### Regional Performance + +OSS provides excellent performance for applications serving Asian markets with extensive regional presence across China, Southeast Asia, and other Asia-Pacific regions. + +## Examples + +### Basic Configuration + +The minimum configuration for a JSON OSS target: + +```yaml +targets: + - name: basic_oss + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-cn-hangzhou" + endpoint: "https://oss-cn-hangzhou.aliyuncs.com" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple OSS buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-ap-southeast-1" + endpoint: "https://oss-ap-southeast-1.aliyuncs.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-cn-shanghai" + endpoint: "https://oss-cn-shanghai.aliyuncs.com" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_oss + type: alibabas3 + pipelines: + - checkpoint + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-cn-beijing" + endpoint: "https://oss-cn-beijing.aliyuncs.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_oss + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-ap-northeast-1" + endpoint: "https://oss-ap-northeast-1.aliyuncs.com" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_oss + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-cn-hangzhou" + endpoint: "https://oss-cn-hangzhou.aliyuncs.com" + bucket: "test-logs" + name: 
"test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` + +### Internal Endpoint + +Configuration using internal endpoint for ECS instances in the same region: + +```yaml +targets: + - name: internal_oss + type: alibabas3 + properties: + key: "LTAI5tAbCdEfGhIjKlMnOpQr" + secret: "aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789" + region: "oss-cn-shanghai" + endpoint: "https://oss-cn-shanghai-internal.aliyuncs.com" + bucket: "application-logs" + name: "logs/{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.json" + format: "json" + compression: "zstd" +``` \ No newline at end of file diff --git a/docs/configuration/targets/backblaze-b2.mdx b/docs/configuration/targets/backblaze-b2.mdx new file mode 100644 index 00000000..078aba23 --- /dev/null +++ b/docs/configuration/targets/backblaze-b2.mdx @@ -0,0 +1,344 @@ +# Backblaze B2 Cloud Storage + +BackblazeLong Term Storage + +## Synopsis + +Creates a target that writes log messages to Backblaze B2 Cloud Storage with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. Backblaze B2 provides cost-effective, reliable cloud storage with simple pricing and no egress fees for many use cases. + +## Schema + +```yaml {1,3} +- name: + description: + type: backblazes3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `backblazes3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Backblaze B2 Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Backblaze B2 application key ID| +|`secret`|Y|-|Backblaze B2 application key| +|`region`|Y|-|B2 region (e.g., `us-west-004`, `eu-central-003`)| +|`endpoint`|Y|-|B2 S3-compatible endpoint (format: `https://s3..backblazeb2.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. 
See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default B2 bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|B2 bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to B2 and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Backblaze B2 target provides cost-effective cloud storage integration with comprehensive file format support. B2 is known for its transparent, simple pricing model with storage at a fraction of the cost of major cloud providers and free egress through Bandwidth Alliance partners. + +### Authentication + +Requires Backblaze B2 application keys. Application keys can be created through the Backblaze web interface under App Keys. Keys can be scoped to specific buckets and operations for enhanced security. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://s3..backblazeb2.com` where `` is your B2 region identifier. Each bucket is associated with a specific region during creation. + +### Available Regions + +Backblaze B2 is available in the following regions: + +|Region Code|Location| +|---|---| +|`us-west-001`|US West (Sacramento)| +|`us-west-002`|US West (Sacramento)| +|`us-west-004`|US West (Phoenix)| +|`us-east-005`|US East (Miami)| +|`eu-central-003`|Europe (Amsterdam)| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. 
+ +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`backblazes3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple B2 buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Cost Advantages + +Backblaze B2 offers highly competitive pricing with storage costs significantly lower than AWS S3, Azure, or Google Cloud. The Bandwidth Alliance provides free egress when downloading to Cloudflare and other partner networks. + +### Pricing Model + +B2 uses a simple pricing structure with no hidden fees, no API request charges, and the first 10GB of storage free. Download fees apply only when exceeding daily free allowances. 
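To make the schema requirement above concrete, the sketch below writes Avro to B2; `schemas/logs.avsc` is a hypothetical location for the schema file referenced by `schema`, and it must be readable when the target initializes.

```yaml
targets:
  - name: avro_b2
    type: backblazes3
    properties:
      key: "0012a3b4c5d6e7f8901234"
      secret: "K001abcdefghijklmnopqrstuvwxyz0123456789"
      region: "us-west-004"
      endpoint: "https://s3.us-west-004.backblazeb2.com"
      bucket: "avro-logs"
      name: "logs-{{.Timestamp}}.avro"
      format: "avro"
      schema: "schemas/logs.avsc"   # hypothetical path to the schema definition
      compression: "snappy"
```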
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON B2 target: + +```yaml +targets: + - name: basic_b2 + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "us-west-004" + endpoint: "https://s3.us-west-004.backblazeb2.com" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple B2 buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "eu-central-003" + endpoint: "https://s3.eu-central-003.backblazeb2.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "us-west-002" + endpoint: "https://s3.us-west-002.backblazeb2.com" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_b2 + type: backblazes3 + pipelines: + - checkpoint + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "us-east-005" + endpoint: "https://s3.us-east-005.backblazeb2.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_b2 + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "us-west-004" + endpoint: "https://s3.us-west-004.backblazeb2.com" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_b2 + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "us-west-001" + endpoint: "https://s3.us-west-001.backblazeb2.com" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` + +### Cost-Optimized Archive + +Configuration optimized for long-term storage with high compression: + +```yaml +targets: + - name: archive_b2 + type: backblazes3 + properties: + key: "0012a3b4c5d6e7f8901234" + secret: "K001abcdefghijklmnopqrstuvwxyz0123456789" + region: "eu-central-003" + endpoint: "https://s3.eu-central-003.backblazeb2.com" + bucket: "log-archive" + name: "archive/{{.Year}}/{{.Month}}/logs-{{.Day}}.json" + format: "json" + compression: "zstd" + max_size: 1073741824 +``` \ No newline at end of file diff --git a/docs/configuration/targets/cloudflare-r2.mdx b/docs/configuration/targets/cloudflare-r2.mdx new file mode 100644 index 00000000..23b6fcbb --- /dev/null +++ 
b/docs/configuration/targets/cloudflare-r2.mdx @@ -0,0 +1,308 @@ +# Cloudflare R2 + +CloudflareLong Term Storage + +## Synopsis + +Creates a target that writes log messages to Cloudflare R2 buckets with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. Cloudflare R2 provides zero egress fees and S3-compatible object storage. + +## Schema + +```yaml {1,3} +- name: + description: + type: cloudflarer2 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `cloudflarer2`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Cloudflare R2 Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Cloudflare R2 access key ID| +|`secret`|Y|-|Cloudflare R2 secret access key| +|`region`|N|`auto`|R2 region (typically `auto` for automatic region selection)| +|`endpoint`|Y|-|R2 endpoint URL (format: `https://.r2.cloudflarestorage.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default R2 bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|R2 bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to R2 and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Cloudflare R2 target provides enterprise-grade cloud storage integration with zero egress fees and comprehensive file format support. R2 is Cloudflare's object storage service designed for high-performance data storage with global accessibility. + +### Authentication + +Requires R2 access credentials obtained from the Cloudflare dashboard. Access keys are scoped to specific accounts and can be restricted to individual buckets for enhanced security. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://.r2.cloudflarestorage.com` where `` is your Cloudflare account identifier found in the R2 dashboard. + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`cloudflarer2`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple R2 buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Cost Advantages + +Cloudflare R2 provides zero egress fees, making it cost-effective for frequently accessed data and analytics workloads that require regular data retrieval. 
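None of the examples below exercise the scheduler fields, so here is a sketch of a nightly batch export; the five-field cron expression is an assumption about the accepted syntax, so check the Cron page for the exact format.

```yaml
targets:
  - name: nightly_r2_export
    type: cloudflarer2
    properties:
      key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b"
      secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b"
      endpoint: "https://abc123def456.r2.cloudflarestorage.com"
      region: "auto"
      bucket: "nightly-exports"
      name: "export-{{.Year}}-{{.Month}}-{{.Day}}.json"
      format: "json"
      cron: "0 0 * * *"   # assumed five-field syntax: once a day at midnight
```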
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON R2 target: + +```yaml +targets: + - name: basic_r2 + type: cloudflarer2 + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple R2 buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: cloudflarer2 + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: cloudflarer2 + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_r2 + type: cloudflarer2 + pipelines: + - checkpoint + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_r2 + type: cloudflarer2 + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_r2 + type: cloudflarer2 + properties: + key: "4f3e2a1b0c9d8e7f6a5b4c3d2e1f0a9b" + secret: "9b8a7c6d5e4f3a2b1c0d9e8f7a6b5c4d3e2f1a0b" + endpoint: "https://abc123def456.r2.cloudflarestorage.com" + region: "auto" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` \ No newline at end of file diff --git a/docs/configuration/targets/digitalocean-spaces.mdx b/docs/configuration/targets/digitalocean-spaces.mdx new file mode 100644 index 00000000..a2ed3041 --- /dev/null +++ b/docs/configuration/targets/digitalocean-spaces.mdx @@ -0,0 +1,325 @@ +# DigitalOcean Spaces + +DigitalOceanLong Term Storage + +## Synopsis + +Creates a target that writes log messages to DigitalOcean Spaces with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. 
DigitalOcean Spaces provides simple, scalable object storage with built-in CDN integration. + +## Schema + +```yaml {1,3} +- name: + description: + type: digitaloceans3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `digitaloceans3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### DigitalOcean Spaces Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|DigitalOcean Spaces access key| +|`secret`|Y|-|DigitalOcean Spaces secret key| +|`region`|Y|-|DigitalOcean region (e.g., `nyc3`, `sfo3`, `ams3`, `sgp1`)| +|`endpoint`|Y|-|Spaces endpoint URL (format: `https://.digitaloceanspaces.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default Spaces bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|Spaces bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to Spaces and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The DigitalOcean Spaces target provides simple, developer-friendly object storage with comprehensive file format support. 
Spaces includes built-in CDN functionality powered by DigitalOcean's global network for fast content delivery. + +### Authentication + +Requires DigitalOcean Spaces access credentials. Access keys can be generated through the DigitalOcean Control Panel under API settings. Each key provides full access to all Spaces in the account. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://.digitaloceanspaces.com` where `` is your chosen DigitalOcean datacenter location. + +### Available Regions + +DigitalOcean Spaces is available in the following regions: + +|Region Code|Location| +|---|---| +|`nyc3`|New York City, USA| +|`sfo3`|San Francisco, USA| +|`ams3`|Amsterdam, Netherlands| +|`sgp1`|Singapore| +|`fra1`|Frankfurt, Germany| +|`syd1`|Sydney, Australia| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`digitaloceans3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple Spaces buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### CDN Integration + +DigitalOcean Spaces includes built-in CDN functionality. Files stored in Spaces can be served globally through the CDN with no additional configuration required. + +### Cost Structure + +DigitalOcean Spaces offers predictable pricing with no egress fees within reasonable limits, making it cost-effective for applications with moderate data transfer requirements. 
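
Where hourly batch uploads are preferred over realtime delivery, the `cron` field can drive the upload schedule instead of `interval`. This is an illustrative sketch: the credentials and bucket name are placeholders, and it assumes the standard five-field cron syntax described in the Cron scheduling documentation.

```yaml
targets:
  - name: spaces_hourly_batch
    type: digitaloceans3
    properties:
      key: "<SPACES-ACCESS-KEY>"
      secret: "<SPACES-SECRET-KEY>"
      region: "ams3"
      endpoint: "https://ams3.digitaloceanspaces.com"
      bucket: "hourly-log-batches"
      name: "batch-{{.Year}}-{{.Month}}-{{.Day}}-{{.Timestamp}}.json"
      format: "json"
      compression: "gzip"
      # Upload the accumulated batch at the top of every hour
      # (assumes standard cron expression syntax)
      cron: "0 * * * *"
```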
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON Spaces target: + +```yaml +targets: + - name: basic_spaces + type: digitaloceans3 + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "nyc3" + endpoint: "https://nyc3.digitaloceanspaces.com" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple Spaces buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: digitaloceans3 + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "sfo3" + endpoint: "https://sfo3.digitaloceanspaces.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: digitaloceans3 + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "ams3" + endpoint: "https://ams3.digitaloceanspaces.com" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_spaces + type: digitaloceans3 + pipelines: + - checkpoint + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "sgp1" + endpoint: "https://sgp1.digitaloceanspaces.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_spaces + type: digitaloceans3 + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "fra1" + endpoint: "https://fra1.digitaloceanspaces.com" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_spaces + type: digitaloceans3 + properties: + key: "DO00ABC123XYZ456" + secret: "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789" + region: "nyc3" + endpoint: "https://nyc3.digitaloceanspaces.com" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` \ No newline at end of file diff --git a/docs/configuration/targets/ibm-cos.mdx b/docs/configuration/targets/ibm-cos.mdx new file mode 100644 index 00000000..4a2c0e22 --- /dev/null +++ b/docs/configuration/targets/ibm-cos.mdx @@ -0,0 +1,321 @@ +# IBM Cloud Object Storage + +IBM CloudLong Term Storage + +## Synopsis + +Creates a target that writes log messages to IBM Cloud Object Storage buckets with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. 
IBM Cloud Object Storage provides enterprise-grade durability, security, and global availability. + +## Schema + +```yaml {1,3} +- name: + description: + type: ibmcos + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `ibmcos`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### IBM Cloud Object Storage Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|IBM Cloud Object Storage HMAC access key ID| +|`secret`|Y|-|IBM Cloud Object Storage HMAC secret access key| +|`region`|Y|-|IBM Cloud region (e.g., `us-south`, `eu-gb`, `jp-tok`)| +|`endpoint`|Y|-|IBM COS endpoint URL (e.g., `https://s3.us-south.cloud-object-storage.appdomain.cloud`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default IBM COS bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|IBM COS bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to IBM COS and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The IBM Cloud Object Storage target provides enterprise-grade cloud storage integration with comprehensive file format support. 
IBM COS offers high durability (99.999999999%), security features, and flexible storage classes for cost optimization. + +### Authentication + +Requires IBM Cloud Object Storage HMAC credentials. HMAC credentials can be created through the IBM Cloud console and provide programmatic access to COS buckets. + +### Endpoint Configuration + +IBM Cloud Object Storage uses region-specific endpoints. The endpoint format is typically `https://s3..cloud-object-storage.appdomain.cloud` where `` corresponds to your chosen IBM Cloud region. + +### Available Regions + +IBM Cloud Object Storage is available in multiple regions worldwide: + +|Region Code|Location| +|---|---| +|`us-south`|Dallas, USA| +|`us-east`|Washington DC, USA| +|`eu-gb`|London, UK| +|`eu-de`|Frankfurt, Germany| +|`jp-tok`|Tokyo, Japan| +|`au-syd`|Sydney, Australia| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`ibmcos`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple IBM COS buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Storage Classes + +IBM Cloud Object Storage supports multiple storage classes for cost optimization. Choose the appropriate class based on data access patterns and retention requirements. 
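
The rotation limits can be tuned so that neither oversized nor near-empty files reach COS. The sketch below uses placeholder credentials and bucket names and rotates at roughly 256 MB or 50,000 events, whichever limit is reached first; the specific values are arbitrary assumptions.

```yaml
targets:
  - name: cos_rotated_logs
    type: ibmcos
    properties:
      key: "<HMAC-ACCESS-KEY-ID>"
      secret: "<HMAC-SECRET-ACCESS-KEY>"
      region: "eu-de"
      endpoint: "https://s3.eu-de.cloud-object-storage.appdomain.cloud"
      bucket: "rotated-logs"
      name: "logs-{{.Timestamp}}.json"
      format: "json"
      compression: "zstd"
      max_size: 268435456   # rotate after ~256 MB (value in bytes)
      batch_size: 50000     # or after 50,000 events, whichever comes first
```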
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON IBM COS target: + +```yaml +targets: + - name: basic_ibm_cos + type: ibmcos + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "us-south" + endpoint: "https://s3.us-south.cloud-object-storage.appdomain.cloud" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple IBM COS buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: ibmcos + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "eu-gb" + endpoint: "https://s3.eu-gb.cloud-object-storage.appdomain.cloud" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: ibmcos + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "jp-tok" + endpoint: "https://s3.jp-tok.cloud-object-storage.appdomain.cloud" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_ibm_cos + type: ibmcos + pipelines: + - checkpoint + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "us-east" + endpoint: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_ibm_cos + type: ibmcos + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "eu-de" + endpoint: "https://s3.eu-de.cloud-object-storage.appdomain.cloud" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_ibm_cos + type: ibmcos + properties: + key: "0123456789abcdef0123456789abcdef" + secret: "fedcba9876543210fedcba9876543210fedcba98" + region: "au-syd" + endpoint: "https://s3.au-syd.cloud-object-storage.appdomain.cloud" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` \ No newline at end of file diff --git a/docs/configuration/targets/minio.mdx b/docs/configuration/targets/minio.mdx new file mode 100644 index 00000000..472e3db3 --- /dev/null +++ b/docs/configuration/targets/minio.mdx @@ -0,0 +1,324 @@ +# MinIO + +MinIOLong Term Storage + +## Synopsis + +Creates a target that writes log messages to MinIO object storage with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. 
MinIO provides high-performance, Kubernetes-native object storage that can be deployed on-premises or in the cloud. + +## Schema + +```yaml {1,3} +- name: + description: + type: minio + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `minio`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### MinIO Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|MinIO access key| +|`secret`|Y|-|MinIO secret key| +|`region`|N|`us-east-1`|MinIO region (default if not using multiple regions)| +|`endpoint`|Y|-|MinIO server endpoint URL (e.g., `http://minio-server:9000` or `https://minio.example.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default MinIO bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|MinIO bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to MinIO and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The MinIO target provides high-performance object storage integration with comprehensive file format support. 
MinIO is designed for private cloud infrastructure and offers enterprise features like versioning, lifecycle management, and replication. + +### Authentication + +Requires MinIO access credentials. Access keys can be created through the MinIO Console or mc (MinIO Client) command-line tool. Keys can be scoped with specific permissions and bucket access policies. + +### Endpoint Configuration + +The endpoint URL points to your MinIO server deployment. This can be a local deployment (e.g., `http://localhost:9000`), an internal network address (e.g., `http://minio-service:9000`), or a public domain with TLS (e.g., `https://minio.example.com`). + +### Deployment Flexibility + +MinIO can be deployed in various environments including on-premises data centers, private clouds, public clouds, edge locations, and Kubernetes clusters. The target works seamlessly with any MinIO deployment. + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`minio`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple MinIO buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Performance Optimization + +MinIO is optimized for high-throughput workloads and can handle concurrent uploads efficiently. Consider using erasure coding and distributed mode for enhanced performance and reliability. 
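
For schema-bound formats such as Avro, the schema file must exist on disk before the target starts. The sketch below assumes a hypothetical schema at `schemas/logs.avsc`; the credentials, endpoint, and bucket are likewise placeholders.

```yaml
targets:
  - name: minio_avro_export
    type: minio
    properties:
      key: "<MINIO-ACCESS-KEY>"
      secret: "<MINIO-SECRET-KEY>"
      endpoint: "http://minio.internal:9000"
      bucket: "avro-logs"
      name: "logs-{{.Year}}-{{.Month}}-{{.Day}}-{{.Timestamp}}.avro"
      format: "avro"
      # Avro schema definition; must be readable at target initialization
      schema: "schemas/logs.avsc"
      compression: "snappy"
```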
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON MinIO target: + +```yaml +targets: + - name: basic_minio + type: minio + properties: + key: "minioadmin" + secret: "minioadmin" + endpoint: "http://minio-server:9000" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple MinIO buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: minio + properties: + key: "analytics-user" + secret: "analytics-password-secure" + endpoint: "https://minio.example.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: minio + properties: + key: "analytics-user" + secret: "analytics-password-secure" + endpoint: "http://10.0.1.100:9000" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_minio + type: minio + pipelines: + - checkpoint + properties: + key: "backup-user" + secret: "backup-password-secure" + endpoint: "https://minio-backup.example.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_minio + type: minio + properties: + key: "normalized-user" + secret: "normalized-password-secure" + endpoint: "http://minio.internal:9000" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_minio + type: minio + properties: + key: "test-user" + secret: "test-password" + endpoint: "http://localhost:9000" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` + +### Kubernetes Deployment + +Configuration for MinIO deployed in Kubernetes: + +```yaml +targets: + - name: k8s_minio + type: minio + properties: + key: "k8s-minio-user" + secret: "k8s-minio-password-secure" + endpoint: "http://minio.default.svc.cluster.local:9000" + bucket: "application-logs" + name: "logs/{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.json" + format: "json" + compression: "zstd" +``` \ No newline at end of file diff --git a/docs/configuration/targets/oracle-cloud-os.mdx b/docs/configuration/targets/oracle-cloud-os.mdx new file mode 100644 index 00000000..8a6954a4 --- /dev/null +++ b/docs/configuration/targets/oracle-cloud-os.mdx @@ -0,0 +1,342 @@ +# Oracle Cloud Infrastructure Object Storage + +Oracle CloudLong Term Storage + +## Synopsis + +Creates a target that writes log messages to Oracle Cloud Infrastructure (OCI) Object Storage with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. 
OCI Object Storage provides enterprise-grade durability, security, and performance with strong integration into Oracle's cloud ecosystem. + +## Schema + +```yaml {1,3} +- name: + description: + type: oracles3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `oracles3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Oracle Cloud Object Storage Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Oracle Cloud access key ID for Customer Secret Keys| +|`secret`|Y|-|Oracle Cloud secret access key| +|`region`|Y|-|OCI region identifier (e.g., `us-ashburn-1`, `eu-frankfurt-1`, `ap-tokyo-1`)| +|`endpoint`|Y|-|OCI Object Storage endpoint (format: `https://.compat.objectstorage..oraclecloud.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default OCI bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|OCI bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to OCI Object Storage and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Oracle Cloud Infrastructure Object Storage target provides enterprise-grade cloud storage integration with comprehensive file format support. OCI Object Storage offers strong data durability, automatic encryption, and seamless integration with Oracle Database and analytics services. + +### Authentication + +Requires OCI Customer Secret Keys for S3 compatibility. Customer Secret Keys can be generated through the OCI Console under User Settings. Each user can have up to two active Customer Secret Keys at a time. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://.compat.objectstorage..oraclecloud.com` where `` is your OCI Object Storage namespace (typically your tenancy name) and `` is your chosen OCI region identifier. + +### Finding Your Namespace + +Your Object Storage namespace can be found in the OCI Console under Tenancy Details or by using the OCI CLI command `oci os ns get`. + +### Available Regions + +Oracle Cloud Infrastructure is available in numerous regions worldwide: + +|Region Code|Location| +|---|---| +|`us-ashburn-1`|US East (Ashburn)| +|`us-phoenix-1`|US West (Phoenix)| +|`us-sanjose-1`|US West (San Jose)| +|`ca-toronto-1`|Canada Southeast (Toronto)| +|`ca-montreal-1`|Canada Southeast (Montreal)| +|`eu-frankfurt-1`|Germany Central (Frankfurt)| +|`eu-zurich-1`|Switzerland North (Zurich)| +|`eu-amsterdam-1`|Netherlands Northwest (Amsterdam)| +|`uk-london-1`|UK South (London)| +|`ap-tokyo-1`|Japan East (Tokyo)| +|`ap-osaka-1`|Japan Central (Osaka)| +|`ap-seoul-1`|South Korea Central (Seoul)| +|`ap-mumbai-1`|India West (Mumbai)| +|`ap-hyderabad-1`|India South (Hyderabad)| +|`ap-sydney-1`|Australia East (Sydney)| +|`ap-melbourne-1`|Australia Southeast (Melbourne)| +|`sa-saopaulo-1`|Brazil East (Sao Paulo)| +|`me-jeddah-1`|Saudi Arabia West (Jeddah)| +|`me-dubai-1`|UAE East (Dubai)| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. 
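
For example, a date-partitioned object name combined with a size cap might look like the sketch below; the tenancy namespace, credentials, bucket, and limits are placeholder assumptions, and the available template variables are listed in the next section.

```yaml
targets:
  - name: oci_daily_partitions
    type: oracles3
    properties:
      key: "<CUSTOMER-SECRET-KEY-ID>"
      secret: "<CUSTOMER-SECRET-KEY>"
      region: "eu-frankfurt-1"
      endpoint: "https://<namespace>.compat.objectstorage.eu-frankfurt-1.oraclecloud.com"
      bucket: "partitioned-logs"
      # Template variables build a date-partitioned object path
      name: "logs/{{.Year}}/{{.Month}}/{{.Day}}/part-{{.Timestamp}}.json"
      format: "json"
      max_size: 134217728   # rotate after ~128 MB (value in bytes)
      batch_size: 100000
```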
+ +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`oracles3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple OCI buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Storage Tiers + +OCI Object Storage supports multiple storage tiers including Standard, Infrequent Access, and Archive for cost optimization based on access patterns and retention requirements. + +### Integration with Oracle Services + +OCI Object Storage integrates seamlessly with Oracle Autonomous Database, Oracle Analytics Cloud, and other Oracle Cloud services for comprehensive data processing pipelines. + +## Examples + +### Basic Configuration + +The minimum configuration for a JSON OCI Object Storage target: + +```yaml +targets: + - name: basic_oci + type: oracles3 + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "us-ashburn-1" + endpoint: "https://mytenancy.compat.objectstorage.us-ashburn-1.oraclecloud.com" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple OCI buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: oracles3 + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "eu-frankfurt-1" + endpoint: "https://mytenancy.compat.objectstorage.eu-frankfurt-1.oraclecloud.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: oracles3 + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "ap-tokyo-1" + endpoint: "https://mytenancy.compat.objectstorage.ap-tokyo-1.oraclecloud.com" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_oci + type: oracles3 + pipelines: + - checkpoint + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "uk-london-1" + endpoint: 
"https://mytenancy.compat.objectstorage.uk-london-1.oraclecloud.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_oci + type: oracles3 + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "ap-sydney-1" + endpoint: "https://mytenancy.compat.objectstorage.ap-sydney-1.oraclecloud.com" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_oci + type: oracles3 + properties: + key: "0a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p" + secret: "AbCdEfGhIjKlMnOpQrStUvWxYz0123456789+/==" + region: "us-phoenix-1" + endpoint: "https://mytenancy.compat.objectstorage.us-phoenix-1.oraclecloud.com" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` \ No newline at end of file diff --git a/docs/configuration/targets/scaleway-os.mdx b/docs/configuration/targets/scaleway-os.mdx new file mode 100644 index 00000000..b979afb1 --- /dev/null +++ b/docs/configuration/targets/scaleway-os.mdx @@ -0,0 +1,346 @@ +# Scaleway Object Storage + +ScalewayLong Term Storage + +## Synopsis + +Creates a target that writes log messages to Scaleway Object Storage with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. Scaleway Object Storage provides European-based, GDPR-compliant cloud storage with multi-region availability and competitive pricing. + +## Schema + +```yaml {1,3} +- name: + description: + type: scaleways3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `scaleways3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Scaleway Object Storage Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Scaleway Object Storage access key| +|`secret`|Y|-|Scaleway Object Storage secret key| +|`region`|Y|-|Scaleway region (e.g., `fr-par`, `nl-ams`, `pl-waw`)| +|`endpoint`|Y|-|Scaleway Object Storage endpoint (format: `https://s3..scw.cloud`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. 
See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default Scaleway bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|Scaleway bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to Scaleway Object Storage and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Scaleway Object Storage target provides European-focused cloud storage integration with comprehensive file format support. Scaleway's infrastructure is entirely based in Europe, making it an ideal choice for organizations requiring GDPR compliance and data sovereignty. + +### Authentication + +Requires Scaleway Object Storage API credentials. API keys can be generated through the Scaleway Console under Credentials. Keys can be scoped with specific permissions and access policies for security. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://s3..scw.cloud` where `` is your chosen Scaleway region identifier. Each bucket is created in a specific region. + +### Available Regions + +Scaleway Object Storage is available in the following European regions: + +|Region Code|Location| +|---|---| +|`fr-par`|France (Paris)| +|`nl-ams`|Netherlands (Amsterdam)| +|`pl-waw`|Poland (Warsaw)| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. 
+ +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`scaleways3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple Scaleway buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### GDPR Compliance + +Scaleway's European infrastructure ensures full GDPR compliance with data residency guarantees. All data centers are located within the European Union, providing strong data sovereignty protections. + +### Storage Classes + +Scaleway Object Storage offers Standard and Glacier storage classes for different access patterns and retention requirements, enabling cost optimization. + +### Multi-Region Capabilities + +Deploy data across multiple European regions for enhanced availability and disaster recovery without leaving the European jurisdiction. 
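
One way to realize this is to define one target per region and send the same pipeline output to both. The sketch below is illustrative only: the credentials and bucket names are placeholders, and the routing of data to these targets is configured separately.

```yaml
targets:
  - name: scaleway_paris
    type: scaleways3
    properties:
      key: "<SCW-ACCESS-KEY>"
      secret: "<SCW-SECRET-KEY>"
      region: "fr-par"
      endpoint: "https://s3.fr-par.scw.cloud"
      bucket: "logs-fr-par"
      name: "logs/{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.json"
      format: "json"

  # Second target in a different European region for redundancy
  - name: scaleway_amsterdam
    type: scaleways3
    properties:
      key: "<SCW-ACCESS-KEY>"
      secret: "<SCW-SECRET-KEY>"
      region: "nl-ams"
      endpoint: "https://s3.nl-ams.scw.cloud"
      bucket: "logs-nl-ams"
      name: "logs/{{.Year}}/{{.Month}}/{{.Day}}/{{.Timestamp}}.json"
      format: "json"
```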
+ +## Examples + +### Basic Configuration + +The minimum configuration for a JSON Scaleway target: + +```yaml +targets: + - name: basic_scaleway + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "fr-par" + endpoint: "https://s3.fr-par.scw.cloud" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple Scaleway buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "nl-ams" + endpoint: "https://s3.nl-ams.scw.cloud" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "pl-waw" + endpoint: "https://s3.pl-waw.scw.cloud" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_scaleway + type: scaleways3 + pipelines: + - checkpoint + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "fr-par" + endpoint: "https://s3.fr-par.scw.cloud" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_scaleway + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "nl-ams" + endpoint: "https://s3.nl-ams.scw.cloud" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_scaleway + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "fr-par" + endpoint: "https://s3.fr-par.scw.cloud" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` + +### GDPR-Compliant Archive + +Configuration optimized for GDPR-compliant long-term storage: + +```yaml +targets: + - name: gdpr_archive + type: scaleways3 + properties: + key: "SCW1A2B3C4D5E6F7G8H9" + secret: "a1b2c3d4-e5f6-7890-a1b2-c3d4e5f6g7h8" + region: "fr-par" + endpoint: "https://s3.fr-par.scw.cloud" + bucket: "compliance-archive" + name: "archive/{{.Year}}/{{.Month}}/logs-{{.Day}}.json" + format: "json" + compression: "zstd" + max_size: 1073741824 +``` \ No newline at end of file diff --git a/docs/configuration/targets/wasabi-cloud-storage.mdx b/docs/configuration/targets/wasabi-cloud-storage.mdx new file mode 100644 index 00000000..80340d05 --- /dev/null +++ b/docs/configuration/targets/wasabi-cloud-storage.mdx @@ -0,0 +1,382 @@ +# Wasabi Hot Cloud Storage + +WasabiLong Term 
Storage + +## Synopsis + +Creates a target that writes log messages to Wasabi Hot Cloud Storage with support for various file formats and authentication methods. The target handles large file uploads efficiently with configurable rotation based on size or event count. Wasabi provides high-performance, cost-effective cloud storage with no egress fees and predictable pricing. + +## Schema + +```yaml {1,3} +- name: + description: + type: wasabis3 + pipelines: + status: + properties: + key: + secret: + region: + endpoint: + part_size: + bucket: + buckets: + - bucket: + name: + format: + compression: + extension: + schema: + name: + format: + compression: + extension: + schema: + max_size: + batch_size: + timeout: + field_format: + interval: + cron: + debug: + status: + dont_send_logs: +``` + +## Configuration + +The following fields are used to define the target: + +|Field|Required|Default|Description| +|---|---|---|---| +|`name`|Y||Target name| +|`description`|N|-|Optional description| +|`type`|Y||Must be `wasabis3`| +|`pipelines`|N|-|Optional post-processor pipelines| +|`status`|N|`true`|Enable/disable the target| + +### Wasabi Hot Cloud Storage Credentials + +|Field|Required|Default|Description| +|---|---|---|---| +|`key`|Y|-|Wasabi access key ID| +|`secret`|Y|-|Wasabi secret access key| +|`region`|Y|-|Wasabi region (e.g., `us-east-1`, `eu-central-1`, `ap-northeast-1`)| +|`endpoint`|Y|-|Wasabi endpoint URL (format: `https://s3..wasabisys.com`)| + +### Connection + +|Field|Required|Default|Description| +|---|---|---|---| +|`part_size`|N|`5`|Multipart upload part size in megabytes (minimum 5MB)| +|`timeout`|N|`30`|Connection timeout in seconds| +|`field_format`|N|-|Data normalization format. See applicable Normalization section| + +### Files + +|Field|Required|Default|Description| +|---|---|---|---| +|`bucket`|N*|-|Default Wasabi bucket name (used if `buckets` not specified)| +|`buckets`|N*|-|Array of bucket configurations for file distribution| +|`buckets.bucket`|Y|-|Wasabi bucket name| +|`buckets.name`|Y|-|File name template| +|`buckets.format`|N|`"json"`|Output format: `json`, `multijson`, `avro`, `parquet`| +|`buckets.compression`|N|`"zstd"`|Compression algorithm| +|`buckets.extension`|N|Matches `format`|File extension override| +|`buckets.schema`|N*|-|Schema definition file path (required for Avro and Parquet formats)| +|`name`|N|`"vmetric.{{.Timestamp}}.{{.Extension}}"`|Default file name template when `buckets` not used| +|`format`|N|`"json"`|Default output format when `buckets` not used| +|`compression`|N|`"zstd"`|Default compression when `buckets` not used| +|`extension`|N|Matches `format`|Default file extension when `buckets` not used| +|`schema`|N|-|Default schema path when `buckets` not used| +|`max_size`|N|`0`|Maximum file size in bytes before rotation| +|`batch_size`|N|`100000`|Maximum number of messages per file| + +\* = Either `bucket` or `buckets` must be specified. When using `buckets`, schema is conditionally required for Avro and Parquet formats. + +:::note +When `max_size` is reached, the current file is uploaded to Wasabi and a new file is created. For unlimited file size, set the field to `0`. +::: + +### Scheduler + +|Field|Required|Default|Description| +|---|---|---|---| +|`interval`|N|realtime|Execution frequency. See Interval for details| +|`cron`|N|-|Cron expression for scheduled execution. 
See Cron for details| + +### Debug Options + +|Field|Required|Default|Description| +|---|---|---|---| +|`debug.status`|N|`false`|Enable debug logging| +|`debug.dont_send_logs`|N|`false`|Process logs but don't send to target (testing)| + +## Details + +The Wasabi Hot Cloud Storage target provides high-performance cloud storage integration with comprehensive file format support. Wasabi is designed as a cost-effective alternative to traditional cloud storage with 80% lower pricing, no egress fees, and no API charges. + +### Authentication + +Requires Wasabi access credentials. Access keys can be created through the Wasabi Console under Access Keys. Sub-users can be created with specific permissions and bucket access for enhanced security. + +### Endpoint Configuration + +The endpoint URL follows the pattern `https://s3..wasabisys.com` where `` is your chosen Wasabi region identifier. Each bucket is associated with a specific region. + +### Available Regions + +Wasabi Hot Cloud Storage is available in the following regions worldwide: + +|Region Code|Location| +|---|---| +|`us-east-1`|US East (N. Virginia)| +|`us-east-2`|US East (N. Virginia)| +|`us-central-1`|US Central (Texas)| +|`us-west-1`|US West (Oregon)| +|`eu-central-1`|Europe (Amsterdam)| +|`eu-central-2`|Europe (Frankfurt)| +|`eu-west-1`|Europe (London)| +|`eu-west-2`|Europe (Paris)| +|`ap-northeast-1`|Asia Pacific (Tokyo)| +|`ap-northeast-2`|Asia Pacific (Osaka)| +|`ap-southeast-1`|Asia Pacific (Singapore)| +|`ap-southeast-2`|Asia Pacific (Sydney)| +|`ca-central-1`|Canada (Toronto)| + +### File Formats + +|Format|Description| +|---|---| +|`json`|Each log entry is written as a separate JSON line (JSONL format)| +|`multijson`|All log entries are written as a single JSON array| +|`avro`|Apache Avro format with schema| +|`parquet`|Apache Parquet columnar format with schema| + +### Compression + +All formats support optional compression to reduce storage costs and transfer times. Compression is applied before upload. + +|Format|Compression Options| +|---|---| +|JSON/MultiJSON|`zstd` (default), `gzip`| +|Avro|`null`, `deflate`, `snappy`, `zstd`| +|Parquet|`uncompressed`, `gzip`, `snappy`, `zstd`, `brotli`, `lz4`| + +### File Management + +Files are rotated based on size (`max_size` parameter) or event count (`batch_size` parameter), whichever limit is reached first. Template variables in file names enable dynamic file naming for time-based partitioning. + +### Templates + +The following template variables can be used in file names: + +|Variable|Description|Example| +|---|---|---| +|`{{.Year}}`|Current year|`2024`| +|`{{.Month}}`|Current month|`01`| +|`{{.Day}}`|Current day|`15`| +|`{{.Timestamp}}`|Current timestamp in nanoseconds|`1703688533123456789`| +|`{{.Format}}`|File format|`json`| +|`{{.Extension}}`|File extension|`json`| +|`{{.Compression}}`|Compression type|`zstd`| +|`{{.TargetName}}`|Target name|`my_logs`| +|`{{.TargetType}}`|Target type|`wasabis3`| +|`{{.Table}}`|Bucket name|`logs`| + +### Multipart Upload + +Large files automatically use multipart upload protocol with configurable part size (`part_size` parameter). Default 5MB part size balances upload efficiency and memory usage. + +### Multiple Buckets + +Single target can write to multiple Wasabi buckets with different configurations, enabling data distribution strategies (e.g., raw data to one bucket, processed data to another). + +### Schema Requirements + +Avro and Parquet formats require schema definition files. 
Schema files must be accessible at the path specified in the `schema` parameter during target initialization. + +### Cost Advantages + +Wasabi offers significant cost savings compared to AWS S3 and other major cloud providers. Key pricing benefits include no egress fees, no API request charges, and storage costs up to 80% lower than AWS S3. + +### Performance Characteristics + +Wasabi delivers consistent, high-speed performance across all storage tiers with no performance degradation. All data is stored on high-performance infrastructure without cold storage tiers. + +### Data Durability + +Wasabi provides 11 nines (99.999999999%) of object durability through erasure coding and geographic distribution of data across multiple data centers. + +### Immutability Support + +Wasabi supports bucket-level and object-level immutability for compliance requirements, enabling write-once-read-many (WORM) storage configurations. + +## Examples + +### Basic Configuration + +The minimum configuration for a JSON Wasabi target: + +```yaml +targets: + - name: basic_wasabi + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "us-east-1" + endpoint: "https://s3.us-east-1.wasabisys.com" + bucket: "datastream-logs" +``` + +### Multiple Buckets + +Configuration for distributing data across multiple Wasabi buckets with different formats: + +```yaml +targets: + - name: multi_bucket_export + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "eu-central-1" + endpoint: "https://s3.eu-central-1.wasabisys.com" + buckets: + - bucket: "raw-data-archive" + name: "raw-{{.Year}}-{{.Month}}-{{.Day}}.json" + format: "multijson" + compression: "gzip" + - bucket: "analytics-data" + name: "analytics-{{.Year}}/{{.Month}}/{{.Day}}/data_{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" +``` + +### Parquet Format + +Configuration for daily partitioned Parquet files: + +```yaml +targets: + - name: parquet_analytics + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "ap-northeast-1" + endpoint: "https://s3.ap-northeast-1.wasabisys.com" + bucket: "analytics-lake" + name: "events/year={{.Year}}/month={{.Month}}/day={{.Day}}/part-{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + max_size: 536870912 +``` + +### High Reliability + +Configuration with enhanced settings: + +```yaml +targets: + - name: reliable_wasabi + type: wasabis3 + pipelines: + - checkpoint + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "us-west-1" + endpoint: "https://s3.us-west-1.wasabisys.com" + bucket: "critical-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + timeout: 60 + part_size: 10 +``` + +### With Field Normalization + +Using field normalization for standard format: + +```yaml +targets: + - name: normalized_wasabi + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "ap-southeast-1" + endpoint: "https://s3.ap-southeast-1.wasabisys.com" + bucket: "normalized-logs" + name: "logs-{{.Timestamp}}.json" + format: "json" + field_format: "cim" +``` + +### Debug Configuration + +Configuration with debugging enabled: + +```yaml +targets: + - name: debug_wasabi + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: 
"abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "us-east-2" + endpoint: "https://s3.us-east-2.wasabisys.com" + bucket: "test-logs" + name: "test-{{.Timestamp}}.json" + format: "json" + debug: + status: true + dont_send_logs: true +``` + +### Cost-Optimized Archive + +Configuration optimized for long-term storage with maximum cost efficiency: + +```yaml +targets: + - name: archive_wasabi + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "eu-west-1" + endpoint: "https://s3.eu-west-1.wasabisys.com" + bucket: "log-archive" + name: "archive/{{.Year}}/{{.Month}}/logs-{{.Day}}.json" + format: "json" + compression: "zstd" + max_size: 1073741824 +``` + +### High-Volume Data Lake + +Configuration for high-volume analytics data lake: + +```yaml +targets: + - name: data_lake_wasabi + type: wasabis3 + properties: + key: "ABCDEFGHIJKLMNOPQRST" + secret: "abcdefghijklmnopqrstuvwxyz0123456789ABCD" + region: "us-central-1" + endpoint: "https://s3.us-central-1.wasabisys.com" + bucket: "enterprise-datalake" + name: "data/year={{.Year}}/month={{.Month}}/day={{.Day}}/hour={{.Hour}}/{{.Timestamp}}.parquet" + format: "parquet" + schema: "" + compression: "snappy" + batch_size: 500000 + max_size: 2147483648 +``` \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index b469246d..f5e330e2 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -115,21 +115,30 @@ const sidebars: SidebarsConfig = { label: "Targets", items: [ "configuration/targets/overview", + "configuration/targets/alibaba-oss", "configuration/targets/aws-s3", "configuration/targets/aws-security-lake", "configuration/targets/azure-blob-storage", "configuration/targets/azure-data-explorer", + "configuration/targets/backblaze-b2", "configuration/targets/bigquery", "configuration/targets/clickhouse", + "configuration/targets/cloudflare-r2", "configuration/targets/console", + "configuration/targets/digitalocean-spaces", "configuration/targets/discard", "configuration/targets/elasticsearch", "configuration/targets/event-hubs", "configuration/targets/file", + "configuration/targets/ibm-cos", "configuration/targets/microsoft-sentinel", "configuration/targets/microsoft-sentinel-data-lake", + "configuration/targets/minio", + "configuration/targets/oracle-cloud-os", + "configuration/targets/scaleway-os", "configuration/targets/splunk-hec", "configuration/targets/syslog", + "configuration/targets/wasabi-cloud-storage", ], }, { From 27cd5eb8e320b9698ad002b5c1047b6792bd0463 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 11:27:25 +0100 Subject: [PATCH 11/13] Add OCSF as a new field type --- docs/appendix/field-formats/ocsf.mdx | 132 +++++++++++++++++++++++++++ sidebars.ts | 1 + topics.json | 1 + 3 files changed, 134 insertions(+) create mode 100644 docs/appendix/field-formats/ocsf.mdx diff --git a/docs/appendix/field-formats/ocsf.mdx b/docs/appendix/field-formats/ocsf.mdx new file mode 100644 index 00000000..d80e5069 --- /dev/null +++ b/docs/appendix/field-formats/ocsf.mdx @@ -0,0 +1,132 @@ +--- +pagination_prev: null +pagination_next: null +--- + +# OCSF + +The Open Cybersecurity Schema Framework (OCSF) is an open standard for security event data that provides a vendor-agnostic way to normalize security logs across different sources. OCSF provides standardization for security-focused log data, enabling seamless integration with AWS Security Lake and other security analytics platforms. 
+ +OCSF organizes security events into classes, each representing a specific type of security activity. When using `field_format: "ocsf"`, VirtualMetric automatically transforms your security data into OCSF-compliant format based on the event type. + +## Available OCSF Schema Classes + +### System Activity (1000-1999) + +- `OCSF1001` - File Activity +- `OCSF1002` - Kernel Extension Activity +- `OCSF1003` - Kernel Activity +- `OCSF1004` - Memory Activity +- `OCSF1005` - Module Activity +- `OCSF1006` - Scheduled Job Activity +- `OCSF1007` - Process Activity + +### Findings (2000-2999) + +- `OCSF2001` - Security Finding +- `OCSF2002` - Vulnerability Finding +- `OCSF2003` - Compliance Finding +- `OCSF2004` - Detection Finding + +### Identity & Access Management (3000-3999) + +- `OCSF3001` - Account Change +- `OCSF3002` - Authentication +- `OCSF3003` - Authorize Session +- `OCSF3004` - Entity Management +- `OCSF3005` - User Access Management +- `OCSF3006` - Group Management + +### Network Activity (4000-4999) + +- `OCSF4001` - Network Activity +- `OCSF4002` - HTTP Activity +- `OCSF4003` - DNS Activity +- `OCSF4004` - DHCP Activity +- `OCSF4005` - RDP Activity +- `OCSF4006` - SMB Activity +- `OCSF4007` - SSH Activity +- `OCSF4008` - FTP Activity +- `OCSF4009` - Email Activity +- `OCSF4010` - Network File Activity +- `OCSF4011` - Email File Activity +- `OCSF4012` - Email URL Activity +- `OCSF4013` - NTP Activity +- `OCSF4014` - Tunnel Activity + +### Discovery (5000-5999) + +- `OCSF5001` - Device Inventory Info +- `OCSF5002` - Device Config State +- `OCSF5003` - User Inventory Info +- `OCSF5004` - Operating System Patch State + +### Application Activity (6000-6999) + +- `OCSF6001` - Web Resources Activity +- `OCSF6002` - Application Lifecycle +- `OCSF6003` - API Activity +- `OCSF6004` - Web Resource Access Activity +- `OCSF6005` - Datastore Activity +- `OCSF6006` - File Hosting Activity + +## Usage + +To enable OCSF normalization, specify the field format in your target configuration: + +```yaml +targets: + - name: my_target + type: awssecuritylake + properties: + field_format: "ocsf" +``` + +When using the VirtualMetric AWS Security Lake Pack, OCSF normalization is handled automatically through the `aws_lake` pipeline. The pack intelligently routes events to the appropriate OCSF schema class based on the source data type and vendor. + +## Integration with AWS Security Lake + +OCSF is the native schema format for AWS Security Lake. When sending data to AWS Security Lake, you must: + +1. Enable OCSF field formatting (handled automatically by the `aws_lake` pipeline) +2. Specify the appropriate OCSF schema identifier for each bucket +3. Ensure data is in Parquet format (handled automatically by the `awssecuritylake` target) + +## VirtualMetric AWS Security Lake Pack + +The VirtualMetric AWS Security Lake Pack provides comprehensive OCSF normalization for diverse security data sources: + +- **Syslog messages** (native, CEF, LEEF formats) are automatically converted to OCSF +- **Windows Security Events** are transformed from ECS through ASIM to OCSF +- **Firewall logs** from major vendors (Fortinet, Palo Alto Networks, Check Point, Cisco ASA, SonicWall, WatchGuard, Cisco Meraki) are normalized to OCSF +- **Windows DNS logs** are converted to OCSF DNS Activity format + +The pack handles multi-stage transformations, preserving vendor-specific context while ensuring OCSF compliance for AWS Security Lake ingestion. 
+ +## Example Configuration + +```yaml +targets: + - name: security_lake + type: awssecuritylake + pipelines: + - aws_lake + properties: + key: "AKIAIOSFODNN7EXAMPLE" + secret: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + region: "us-east-1" + source: "virtualmetric" + account: "123456789012" + buckets: + - bucket: "aws-security-data-lake-network" + name: "network-{{.Timestamp}}.parquet" + schema: "OCSF4001" + - bucket: "aws-security-data-lake-auth" + name: "auth-{{.Timestamp}}.parquet" + schema: "OCSF3002" + - bucket: "aws-security-data-lake-dns" + name: "dns-{{.Timestamp}}.parquet" + schema: "OCSF4003" +``` + +In this example, the `aws_lake` pipeline automatically normalizes all events to OCSF format, and the target routes them to the appropriate buckets based on their schema class. \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index f5e330e2..e56371d4 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -384,6 +384,7 @@ const sidebars: SidebarsConfig = { "appendix/field-formats/csl", "appendix/field-formats/ecs", "appendix/field-formats/leef", + "appendix/field-formats/ocsf", ], }, { diff --git a/topics.json b/topics.json index ab17922d..ae58bd5c 100644 --- a/topics.json +++ b/topics.json @@ -52,6 +52,7 @@ "appendix-cim": "/appendix/field-formats/cim", "appendix-ecs": "/appendix/field-formats/ecs", "appendix-leef": "/appendix/field-formats/leef", + "appendix-ocsf": "/appendix/field-formats/ocsf", "appendix-estreamer": "/appendix/protocols/estreamer", "appendix-netflow": "/appendix/protocols/netflow", "appendix-ipfix": "/appendix/protocols/ipfix", From e4fea585be664122ee028fcfe7a88ba0ac53b4e0 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 13:14:46 +0100 Subject: [PATCH 12/13] Add siem optimization --- docs/about/siem-optimization.mdx | 580 +++++++++++++++++++++++++++++++ sidebars.ts | 1 + 2 files changed, 581 insertions(+) create mode 100644 docs/about/siem-optimization.mdx diff --git a/docs/about/siem-optimization.mdx b/docs/about/siem-optimization.mdx new file mode 100644 index 00000000..8a45c800 --- /dev/null +++ b/docs/about/siem-optimization.mdx @@ -0,0 +1,580 @@ +--- +sidebar_label: SIEM Optimization +--- + +# SIEM Optimization + +**VirtualMetric DataStream** provides comprehensive data optimization capabilities that significantly reduce storage costs and improve query performance across multiple security platforms including _Microsoft Sentinel_, _AWS Security Lake_, _Elasticsearch_, _Splunk Enterprise Security_, and _Google SecOps_. Through intelligent field-level optimization and optional event filtering, organizations can achieve 55-60% data reduction while preserving all security-critical information required for detection and response operations. + +## Risk-Free Reduction Framework + +**DataStream**'s Risk-Free Reduction represents a fundamentally different approach to data optimization compared to traditional telemetry pipelines. While most solutions focus on dropping entire log lines, **DataStream** focuses on removing garbage from log content, eliminating unnecessary fields while preserving complete security context. This field-level approach achieves substantial data reduction without compromising detection capabilities. + +The framework is built on extensive analysis of Microsoft Sentinel content, including analytic queries, ASIM parsers, detection rules, and workbooks. 
For each supported vendor, VirtualMetric engineers analyze which fields are actively used by security operations and which fields contain only operational metadata or placeholder values. This analysis has been validated by external third-party security experts, confirming that only truly unnecessary data is removed. + +```mermaid +graph TD + Analysis[Microsoft Sentinel Content Analysis] --> Queries([Analytic Queries]) + Analysis --> Parsers([ASIM Parsers]) + Analysis --> Rules([Detection Rules]) + Analysis --> Workbooks([Workbooks]) + + Queries --> Mapping([Field Usage Mapping]) + Parsers --> Mapping + Rules --> Mapping + Workbooks --> Mapping + + Mapping --> Critical[Security-Critical Fields] + Mapping --> Unused[Unused Fields] + + Critical --> Preserve([Preserve in All Vendors]) + Unused --> Remove([Safe to Remove]) + + Remove --> Validation([3rd Party Validation]) + Validation --> Pack([Vendor Optimization Pack]) + + style Analysis fill:#BCC0E7 + style Mapping fill:#E5E2FB + style Pack fill:#E5E2FB +``` + +This methodology ensures zero security risk because optimization decisions are based on actual usage patterns in production security operations, not assumptions or heuristics. When Microsoft Sentinel parsers require a field for normalization or analytic rules reference a field for detection, that field is preserved regardless of its content. + +Key principles include: + +* **Field-level optimization** - removes unnecessary fields, not entire events +* **Content-based analysis** - decisions based on Microsoft Sentinel production usage +* **Third-party validation** - external experts verify security integrity +* **Vendor-specific intelligence** - unique optimization for each vendor's log format +* **Preservation guarantees** - all detection-relevant fields always retained +* **No AI/ML involvement** - deterministic, predictable optimization behavior + +## Why VirtualMetric's Approach is Superior + +**DataStream** deliberately avoids AI-based optimization techniques that other vendors promote, recognizing the fundamental incompatibility between AI unpredictability and enterprise security requirements. AI models can produce unexpected results, potentially dropping critical security events without warning. This unpredictability is unacceptable in security operations where a single missed alert could represent a major breach. + +AI-based approaches introduce multiple risks that VirtualMetric's deterministic framework eliminates. AI models require training on actual log data, creating privacy and compliance concerns as sensitive security information may be learned by the model. AI processing adds significant latency and computational cost, reducing throughput and increasing infrastructure requirements. Most critically, AI decisions cannot be audited or validated, making it impossible to verify that security-relevant data is preserved. 
+ +```mermaid +graph TD + subgraph AI[AI-Based Optimization Risks] + R1[Unpredictable Results] + R2[May Drop Critical Events] + R3[Privacy Concerns] + R4[Training on Sensitive Data] + R5[Processing Latency] + R6[Increased Costs] + R7[Non-Auditable Decisions] + end + + subgraph VM[VirtualMetric Approach] + V1[Deterministic Rules] + V2[Guaranteed Field Preservation] + V3[No Data Learning] + V4[High Performance] + V5[Cost Efficient] + V6[Fully Auditable] + V7[Expert Validated] + end + + AI -.->|Risk| Enterprise[Enterprise Security] + VM -.->|Safe| Enterprise + + style AI fill:#FFE5E5 + style VM fill:#BCC0E7 + style Enterprise fill:#E5E2FB +``` + +**DataStream**'s expert-driven approach provides predictable, consistent results that security teams can trust. Every optimization decision is based on analysis of real-world security operations, validated by experts, and documented for audit purposes. Organizations can confidently deploy aggressive optimization knowing that detection capabilities remain intact. + +Advantages over AI-based optimization include: + +* **Predictable behavior** - same input always produces same output +* **Zero risk of dropping critical events** - preservation rules are absolute +* **No privacy concerns** - no learning from customer data +* **Maximum performance** - no AI processing overhead +* **Lower costs** - efficient rule-based processing +* **Complete auditability** - every decision can be traced and validated +* **Enterprise trust** - deterministic systems meet compliance requirements + +## Unified Optimization Strategy + +**DataStream** employs a smart, centralized optimization strategy that dramatically simplifies management across multiple SIEM platforms. Rather than maintaining separate optimization logic for each target platform, the system applies vendor-specific optimization based on Microsoft Sentinel content analysis, then transforms the optimized data to target schemas in post-processing pipelines. + +This approach means administrators configure optimization rules once per vendor, not once per vendor per SIEM platform. A single Fortinet optimization pack automatically reduces data volume for Sentinel, Splunk, Elasticsearch, and all other configured destinations. Changes to vendor-specific filtering rules immediately apply across the entire multi-platform deployment. + +```mermaid +graph TD + Vendor[Vendor Logs] --> Pack([Vendor Optimization Pack]) + + Pack --> Optimized[Optimized Data] + + Optimized --> Schema([Multi-Schema Transform]) + + Schema --> ASIM[ASIM - Microsoft Sentinel] + Schema --> OCSF[OCSF - AWS Security Lake] + Schema --> ECS[ECS - Elasticsearch] + Schema --> CIM[CIM - Splunk] + Schema --> UDM[UDM - Google SecOps] + + style Pack fill:#BCC0E7 + style Schema fill:#E5E2FB +``` + +This unified strategy provides significant operational advantages. Security teams maintain a single set of optimization rules regardless of how many SIEM platforms they use. Testing and validation happens once, not repeatedly for each destination. Knowledge gained from Microsoft Sentinel content analysis automatically benefits all target platforms. + +The approach works because security-relevant fields are consistent across platforms. A field that contains critical detection data for Microsoft Sentinel also contains critical data for Splunk or Elasticsearch. By optimizing based on Microsoft Sentinel's comprehensive parser and detection rule ecosystem, **DataStream** ensures security integrity across all platforms. 
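
To make this concrete, the sketch below shows one vendor optimization pack shared across three destinations, with only the `field_format` changing per target. The pack name `fortinet_optimization` and the target names are placeholders, and connection properties are omitted; treat it as a rough sketch of the single-pack, multi-schema model rather than a literal configuration.

```yaml
targets:
  # The same vendor pack performs field-level optimization once;
  # each destination only selects its own schema.
  - name: splunk_es
    type: splunk
    pipelines:
      - fortinet_optimization   # placeholder vendor pack name
    properties:
      field_format: "cim"       # Splunk Common Information Model
      # endpoints/token omitted

  - name: security_lake
    type: awssecuritylake
    pipelines:
      - fortinet_optimization
    properties:
      field_format: "ocsf"      # AWS Security Lake schema
      # credentials/buckets omitted

  - name: elastic_cluster
    type: elastic
    pipelines:
      - fortinet_optimization
    properties:
      field_format: "ecs"       # Elastic Common Schema
      # endpoints omitted
```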
+ +Benefits include: + +* **Single configuration point** - one vendor pack optimizes for all destinations +* **Simplified management** - no per-platform optimization rules needed +* **Consistent behavior** - same optimization across all SIEM platforms +* **Easier validation** - test once, deploy everywhere +* **Reduced complexity** - fewer configuration files to maintain +* **Faster deployment** - single change affects all platforms +* **Knowledge leverage** - Microsoft Sentinel analysis benefits all destinations + +## Vendor-Specific Optimization Packs + +**DataStream** includes pre-built optimization packs for major security vendors, each developed through detailed analysis of Microsoft Sentinel parsers, analytic queries, and detection rules. These packs understand the specific log formats and field structures for each vendor, applying precise field-level optimization while guaranteeing preservation of security-relevant data. + +Each vendor pack identifies which fields are actively used in security operations and which fields consistently contain placeholder values, operational metadata, or redundant information. The packs parse complex extension fields, remove unnecessary attributes, and reconstruct only the meaningful portions of each log entry. + +```mermaid +graph LR + Logs[Vendor Logs] + + subgraph Packs[Vendor Optimization Packs] + FN[Fortinet] + PA[Palo Alto] + CP[Check Point] + CS[Cisco] + ZS[Zscaler] + CT[Citrix] + FP[Forcepoint] + F5[F5 BigIP] + SW[SonicWall] + BC[Barracuda] + IB[Infoblox] + WG[WatchGuard] + NZ[Nozomi] + AK[Akamai] + EH[ExtraHop] + DT[Darktrace] + CA[CyberArk] + VC[Vectra] + CR[CrowdStrike] + SM[Symantec] + SO[Sophos] + JN[Juniper] + AR[Aruba] + S1[SentinelOne] + end + + Logs --> Packs + Packs --> Optimized[Field-Optimized Data] + + style Packs fill:#BCC0E7 + style Optimized fill:#E5E2FB +``` + +The vendor pack library is continuously expanding and includes optimization for leading security solutions across firewalls, proxies, endpoint protection, network detection and response, privileged access management, and cloud security platforms. + +Supported vendor optimization packs include: + +* **Network Security** - Fortinet FortiGate, Palo Alto Networks, Check Point, Cisco ASA, SonicWall, Barracuda WAF, WatchGuard, Juniper SRX +* **Secure Web Gateway** - Zscaler, Citrix NetScaler, Forcepoint +* **Application Delivery** - F5 BigIP, Citrix ADC +* **DNS Security** - Infoblox +* **Network Detection & Response** - Nozomi Networks, ExtraHop RevealX, Darktrace, Vectra +* **Cloud Security** - Akamai Edge Platform +* **Privileged Access** - CyberArk +* **Endpoint Protection** - CrowdStrike Falcon, Symantec Endpoint Protection, Sophos XG, SentinelOne +* **Network Access Control** - Aruba ClearPass + +Each pack automatically activates when logs from the corresponding vendor are detected, requiring no manual configuration. + +## Intelligent Field Optimization + +The core of **DataStream**'s Risk-Free Reduction is intelligent field-level optimization that removes garbage from log content without eliminating security context. The **Compact Processor** automatically removes fields that provide no security value, including empty fields, null values, and common placeholder patterns found across different security vendors. + +The processor recognizes standard placeholder values including numeric zeros, string placeholders, undefined values, and various representations of "no data available." 
By analyzing Microsoft Sentinel parsers and detection rules, VirtualMetric engineers identified which fields are never referenced in security operations, allowing safe removal even when they contain data. + +```mermaid +graph TD + Data[Security Event] --> Analysis([Field Analysis]) + + Analysis --> Used{Used by Sentinel?} + + Used -->|Yes| Preserve[Preserve Field] + Used -->|No| Check{Has Value?} + + Check -->|Placeholder| Remove[Remove Field] + Check -->|Empty| Remove + Check -->|Null| Remove + Check -->|Real Value| Evaluate{Security Value?} + + Evaluate -->|None| Remove + Evaluate -->|Potential| Preserve + + Preserve --> Output[Optimized Event] + Remove --> Output + + style Analysis fill:#BCC0E7 + style Output fill:#E5E2FB +``` + +The processor supports configurable exclusion lists to preserve specific fields even when they contain placeholder values. This is essential for fields like severity levels or operation codes where a zero value carries semantic meaning and is referenced in detection logic. + +Key capabilities include: + +* **Microsoft Sentinel usage analysis** - preserves fields used in parsers and queries +* **Automatic placeholder detection** - recognizes vendor-specific null patterns +* **Configurable value patterns** - "0", "undefined", "0x0", "-", "N/A" and custom patterns +* **Field exclusion support** - protects fields where placeholders have meaning +* **Extension field processing** - parses and optimizes CEF/LEEF additional extensions +* **XML optimization** - processes Windows Event Log EventData efficiently +* **Recursive cleanup** - handles nested objects and arrays + +## Optional Event-Level Filtering + +Beyond field-level optimization, **DataStream** provides optional event-level filtering that removes entire log entries based on industry best practices and expert knowledge. These filters are **disabled by default** to ensure conservative, risk-free operation, but can be enabled when organizations want more aggressive data reduction. + +Event filters are developed based on deep vendor knowledge and real-world security operations experience. VirtualMetric engineers identify specific log types, event IDs, and traffic patterns that generate high volumes but rarely contain security-relevant information. These patterns are documented and validated before inclusion in vendor packs. 
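
Because these filters ship disabled, turning them on is an explicit opt-in. A minimal sketch using only the optimization keys documented later in Configuration and Control, with field-level optimization left on and event-level filtering switched on:

```yaml
optimization:
  status: true             # master switch for optimization features
  use_asim_filters: true   # field-level optimization (default: enabled)
  use_event_filters: true  # opt in to event-level filtering
```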
+ +```mermaid +graph TD + Event[Security Event] --> FieldOpt([Field Optimization - Always On]) + + FieldOpt --> Optimized[Field-Optimized Event] + + Optimized --> EventFilter{Event Filters Enabled?} + + EventFilter -->|No - Default| Output[To SIEM] + EventFilter -->|Yes - Optional| Analysis([Pattern Analysis]) + + Analysis --> Type{Event Type} + + Type -->|Private-to-Private| Drop[Drop Event] + Type -->|IPv6 Local| Drop + Type -->|Reserved Country| Drop + Type -->|Security Relevant| Output + + style FieldOpt fill:#BCC0E7 + style Analysis fill:#E5E2FB + style Output fill:#E5E2FB +``` + +Common event filtering patterns include: + +* **Private network traffic** - communications between internal private IP addresses +* **IPv6 local traffic** - link-local (fe80::) and unique local (fc00::) addresses +* **Reserved geographic regions** - traffic from unassigned country codes +* **Accepted outbound connections** - permitted traffic from internal to external +* **Specific event IDs** - vendor-specific operational events with no security value + +Organizations enable event filtering after reviewing their specific environment and security requirements, understanding that aggressive filtering provides maximum cost savings while field-level optimization alone delivers substantial reduction with zero risk. + +## Statistical Sampling + +For organizations requiring even more aggressive data reduction, **DataStream** provides configurable statistical sampling that retains only a percentage of events matching specific criteria. Sampling is always selective, never applied to security-critical events, and users configure exactly which event types should be sampled at which rates. + +The sampling engine allows different rates for different event patterns. High-volume operational traffic might be sampled at 1-in-10 while verbose debug logs are sampled at 1-in-100. Security alerts, authentication failures, and other critical events are never sampled, ensuring complete visibility into actual security incidents. + +```mermaid +graph TD + Stream[Event Stream] --> Classify([Event Classification]) + + Classify --> Priority{Event Category} + + Priority -->|Security Critical| Full[100% Retention] + Priority -->|Operational - High Volume| Sample1([1-in-10 Sampling]) + Priority -->|Operational - Very High Volume| Sample2([1-in-100 Sampling]) + + Sample1 --> Output[To SIEM] + Sample2 --> Output + Full --> Output + + style Classify fill:#BCC0E7 + style Output fill:#E5E2FB +``` + +Sampling capabilities include: + +* **Rule-based sampling** - different rates for different event patterns +* **Vendor-specific rules** - sampling patterns tuned per vendor +* **Configurable rates** - precise control over retention percentages +* **Security event protection** - critical events never sampled +* **Statistical validity** - maintains representative distributions +* **Deterministic behavior** - consistent, predictable sampling + +## Dynamic Sampling + +Beyond static sampling rates, **DataStream** supports dynamic sampling that adjusts retention rates based on current data volumes and system conditions. This advanced capability prevents data loss during unusual activity while maintaining aggressive reduction during normal operations. + +Dynamic sampling monitors incoming data rates and automatically reduces sampling when volumes drop or increase retention when volumes spike. This ensures that unusual patterns, which often indicate security events, receive higher retention while routine operational traffic is aggressively reduced. 
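
The sampling controls use the same optimization block shown later in Configuration and Control. A hedged sketch follows, assuming `sample_rate` expresses an approximate 1-in-N retention ratio for sampled event types and that the per-pattern and dynamic threshold rules live in the vendor packs rather than in this block:

```yaml
optimization:
  status: true
  use_sampling: true   # enable selective sampling
  sample_rate: 10      # assumed: retain roughly 1 in 10 events of sampled types
  use_asim_filters: true
```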
+ +Key features include: + +* **Volume-based adjustment** - responds to traffic pattern changes +* **Anomaly detection** - increases retention during unusual activity +* **Automatic rate tuning** - optimizes sampling without manual intervention +* **Threshold configuration** - defines volume levels triggering adjustments +* **Real-time response** - immediate adaptation to changing conditions + +## Aggregation + +For use cases where real-time delivery is not required, **DataStream** provides aggregation capabilities that combine similar events into summarized records, achieving additional data reduction. Aggregation operates on configurable time intervals, such as 1 minute or 5 minutes, grouping events by key attributes and producing statistical summaries. + +Aggregation is particularly valuable for high-volume metrics, performance data, and operational telemetry where individual events provide less value than aggregate statistics. Organizations configure which event types to aggregate, which fields to group by, and what statistics to calculate. + +```mermaid +graph TD + Events[Event Stream] --> Window([Time Window - 1/5 min]) + + Window --> Group([Group by Attributes]) + + Group --> Calc([Calculate Statistics]) + + Calc --> Summary[Aggregated Summary] + + Summary --> Output[To SIEM] + + style Window fill:#BCC0E7 + style Summary fill:#E5E2FB + style Output fill:#E5E2FB +``` + +Aggregation capabilities include: + +* **Time-based windowing** - configurable aggregation intervals +* **Multi-field grouping** - combine events by multiple attributes +* **Statistical functions** - count, sum, average, min, max, percentiles +* **Selective aggregation** - only specified event types aggregated +* **Metadata preservation** - maintains security context in summaries + +Note that aggregation introduces latency equal to the aggregation window, making it unsuitable for real-time security monitoring. Organizations typically use aggregation for operational metrics and performance data while sending security events in real-time. + +## Correlation ID and Archive Integration + +**DataStream** provides a sophisticated correlation ID system that enables cost-effective long-term storage while maintaining the ability to retrieve complete original logs when needed. The system appends a unique correlation ID to each event before optimization, creating a permanent link between the optimized data in active SIEM platforms and complete raw data in archival storage. + +This architecture allows organizations to send full, unoptimized logs to low-cost storage tiers like Azure Blob Storage, AWS S3, Azure Data Explorer, Google BigQuery, or Microsoft Sentinel data lake, while sending optimized, field-reduced logs to expensive active SIEM platforms. Security analysts work with optimized data for day-to-day operations but can retrieve complete original logs for forensic investigations using the correlation ID. 
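
A minimal sketch of this dual-path layout follows. It assumes the raw path is simply a target with no vendor optimization pipeline attached, that the optimized path reuses a vendor pack (the `fortinet_optimization` name is a placeholder), and that `append_correlationid`, shown later in Configuration and Control, stamps the shared ID on both copies; target names and properties are illustrative and abbreviated.

```yaml
optimization:
  status: true
  append_correlationid: true   # the same ID is attached to every copy of an event

targets:
  # Complete raw logs to low-cost archive storage
  - name: raw_archive
    type: awss3
    properties:
      bucket: "log-archive"
      region: "us-east-1"

  # Field-optimized logs to the active SIEM
  - name: sentinel_optimized
    type: sentinel
    pipelines:
      - fortinet_optimization   # placeholder vendor pack name
    properties:
      # workspace_id / shared_key omitted
      log_type: "OptimizedLogs"
```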
+ +```mermaid +graph TD + Original[Original Log] --> ID([Append Correlation ID]) + + ID --> Split{Data Path} + + Split -->|Full Raw Data| Archive[Archival Storage] + Split -->|Optimized Data| SIEM[Active SIEM] + + Archive --> Blob[Azure Blob / AWS S3] + Archive --> Lake[Sentinel Data Lake] + Archive --> ADX[Azure Data Explorer] + Archive --> BQ[Google BigQuery] + + SIEM --> Sentinel[Microsoft Sentinel] + SIEM --> Splunk[Splunk] + SIEM --> Elastic[Elasticsearch] + + Sentinel -.->|KQL Join| Blob + Sentinel -.->|KQL Join| Lake + Sentinel -.->|KQL Join| ADX + + style ID fill:#BCC0E7 + style Archive fill:#E5E2FB + style SIEM fill:#E5E2FB +``` + +This approach is particularly powerful with Microsoft Sentinel, where KQL supports joining data across multiple sources including Sentinel workspaces, Azure Data Explorer, Sentinel data lake, and Azure Blob Storage. Analysts can query optimized data for fast, cost-effective operations, then seamlessly retrieve complete original logs when investigation requires full context. + +The correlation ID system enables: + +* **Dual-tier storage** - active SIEM for optimized data, archive for complete logs +* **Cost optimization** - expensive platforms store only reduced data +* **Complete forensics** - full original logs always available via correlation ID +* **Cross-platform joins** - KQL queries span multiple storage systems +* **Audit compliance** - complete logs preserved for regulatory requirements +* **Investigation flexibility** - analysts choose appropriate level of detail + +The correlation ID is implemented as a unique identifier appended to each event during initial processing. This ID remains consistent across all destinations, whether the event is sent to Sentinel, ADX, Blob Storage, or multiple platforms simultaneously. When analysts identify events of interest in optimized Sentinel data, they use the correlation ID to retrieve corresponding full records from archival storage. + +## Windows Event Log Optimization + +Windows Security Event logs represent one of the highest volume data sources in enterprise environments. **DataStream** provides specialized optimization for Windows events that can reduce their size by 60-70% through intelligent EventData field processing while maintaining complete security visibility. + +Windows events include a complex XML EventData field containing dozens of attributes, many of which contain placeholder values or operational metadata not used in security detection. VirtualMetric's analysis of Microsoft Sentinel Windows parsers and detection rules identified which EventData attributes are security-relevant and which can be safely removed. + +```mermaid +graph TD + WinLog[Windows Security Event] --> Parse([XML Parser]) + + Parse --> EventData[EventData Field] + + EventData --> Sentinel([Sentinel Parser Analysis]) + + Sentinel --> Used{Used in Detections?} + + Used -->|Yes| Keep[Preserve Attribute] + Used -->|No| Check{Has Value?} + + Check -->|Placeholder| Remove[Remove Attribute] + Check -->|Empty| Remove + Check -->|Meaningful| Keep + + Keep --> Rebuild([Reconstruct XML]) + Remove --> Rebuild + + Rebuild --> Optimized[Optimized Event] + + style Parse fill:#BCC0E7 + style Sentinel fill:#E5E2FB + style Optimized fill:#E5E2FB +``` + +The system parses the EventData XML, analyzes each attribute against Microsoft Sentinel usage patterns, removes unnecessary attributes and placeholders, and reconstructs a minimal XML structure containing only security-relevant data. 
This selective processing dramatically reduces storage requirements while preserving all information used by detection rules and ASIM parsers. + +Windows-specific optimizations include: + +* **EventData XML parsing** - efficient processing of complex event structures +* **Sentinel parser validation** - preserves fields used in ASIM normalization +* **Default GUID removal** - strips placeholder GUIDs like 00000000-0000-0000-0000-000000000000 +* **Empty attribute removal** - eliminates fields with no values +* **Placeholder filtering** - removes "0x0", "-", and vendor-specific patterns +* **Schema-aware preservation** - maintains detection-required fields +* **XML reconstruction** - creates minimal valid EventData structure +* **Level and opcode protection** - preserves operational fields where zeros matter + +## Configuration and Control + +All optimization features are fully configurable through the **DataStream** management interface or direct pipeline configuration. The system provides granular control over every optimization technique, from global enable/disable switches to field-level filtering rules. Default configuration emphasizes safety, with only field-level optimization enabled and event filtering disabled. + +Configuration options are organized hierarchically. Master switches control broad categories of optimization while detailed settings allow fine-tuned control. This structure enables quick deployment of conservative optimization settings while providing flexibility for aggressive reduction of high-volume, low-value data sources. + +```yaml +optimization: + # Master switch for all optimization features + status: true + + # Statistical sampling (disabled by default) + use_sampling: false + sample_rate: 10 + + # Event-level filtering (disabled by default) + use_event_filters: false + + # ASIM-aware field optimization (enabled by default) + use_asim_filters: true + + # Correlation ID for archive integration + append_correlationid: true +``` + +Configuration capabilities include: + +* **Conservative defaults** - field optimization on, event filtering off +* **Per-vendor customization** - different rules for each vendor +* **Per-platform settings** - optimize differently for Sentinel vs Splunk +* **Sampling rate adjustment** - configurable retention percentages +* **Custom filter rules** - user-defined filtering logic +* **Field exclusion lists** - protect specific fields from optimization +* **Correlation ID control** - enable archive integration +* **Real-time updates** - changes applied without restarts + +## Performance and Cost Impact + +The optimization capabilities in **DataStream** deliver substantial cost savings across all supported security platforms. Real-world deployments consistently achieve 55-60% data reduction through field-level optimization alone, with aggressive configurations reaching 70-80% reduction when combining field optimization, event filtering, sampling, and aggregation. + +Beyond direct storage cost savings, optimization improves query performance by reducing the amount of data that analytics engines must process. Faster queries mean more responsive security operations, reduced infrastructure requirements, and better experience for security analysts. 
+ +```mermaid +graph LR + Before[100% Raw Data] + Field[40-45% Field Optimized] + Event[30-40% Event Filtered] + Sample[20-30% Sampled] + + Before -->|Field Optimization| Field + Field -->|Event Filtering| Event + Event -->|Sampling| Sample + + subgraph Impact[Cost & Performance Impact] + Storage[55-80% Storage Savings] + Query[2-3x Query Performance] + Network[60-70% Network Reduction] + Cost[50-80% Cost Reduction] + end + + Sample --> Impact + + style Before fill:#E5E2FB + style Field fill:#BCC0E7 + style Event fill:#BCC0E7 + style Sample fill:#BCC0E7 + style Impact fill:#E5E2FB +``` + +Measured benefits include: + +* **Storage cost reduction** - 55-60% with field optimization, 70-80% with full optimization +* **Query performance improvement** - 2-3x faster analytics queries +* **Network bandwidth savings** - 60-70% reduction in data transmission +* **Infrastructure optimization** - reduced processing and indexing overhead +* **License optimization** - lower per-GB licensing costs +* **Operational efficiency** - faster incident investigation and response + +The correlation ID system provides additional cost benefits by enabling tiered storage strategies. Organizations can maintain expensive active SIEM platforms at 40-50% of original data volume while archiving complete logs to storage costing 90% less per GB. + +## Security and Compliance Considerations + +All optimization techniques in **DataStream** are designed with security and compliance requirements as primary considerations. The field-level optimization approach based on Microsoft Sentinel content analysis ensures that no security-relevant data is eliminated. External third-party validation confirms the integrity of optimization decisions. + +For regulated environments, the correlation ID system enables compliance with data retention mandates while still achieving substantial cost savings. Complete original logs remain available in archival storage while optimized data serves day-to-day security operations. This satisfies regulatory requirements for log retention while optimizing costs for active analysis. + +```mermaid +graph TD + Optimization[Optimization Process] --> Field([Field Analysis]) + + Field --> Sentinel([Microsoft Sentinel Usage]) + Sentinel --> Detection([Detection Rules]) + Sentinel --> Parsers([ASIM Parsers]) + Sentinel --> Analytics([Analytics Queries]) + + Detection --> Validation([3rd Party Validation]) + Parsers --> Validation + Analytics --> Validation + + Validation --> Safe{Security Safe?} + + Safe -->|Yes| Deploy[Deploy Optimization] + Safe -->|No| Reject[Reject Changes] + + Deploy --> Audit([Audit Trail]) + Audit --> Compliance[Compliance Ready] + + style Optimization fill:#BCC0E7 + style Validation fill:#E5E2FB + style Compliance fill:#E5E2FB +``` + +Key security and compliance features include: + +* **Third-party validation** - external experts verify optimization safety +* **Deterministic behavior** - no AI unpredictability +* **Complete audit trail** - logging of all optimization decisions +* **Compliance mode** - pre-configured settings for regulatory requirements +* **Field protection** - guaranteed preservation of detection-relevant data +* **Archive integration** - complete logs preserved via correlation ID +* **Risk assessment reporting** - validation of optimization security impact +* **No sensitive data exposure** - no AI training on customer logs + +--- + +

**DataStream**'s optimization capabilities deliver dramatic cost savings across multiple security platforms while preserving complete security visibility and regulatory compliance. The Risk-Free Reduction framework, grounded in Microsoft Sentinel content analysis, ensures that cost optimization never compromises security effectiveness, while the unified optimization strategy simplifies management across diverse SIEM deployments.
+ +--- \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index e56371d4..888b3d01 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -11,6 +11,7 @@ const sidebars: SidebarsConfig = { "about/applications", "about/architecture", "about/key-features", + "about/siem-optimization", "about/licensing", ], }, From 58ebe2b58d318839873283ef8d95788e32b6c118 Mon Sep 17 00:00:00 2001 From: Yusuf Ozturk Date: Sun, 26 Oct 2025 13:18:21 +0100 Subject: [PATCH 13/13] Rename Oracle product name --- docs/configuration/targets/oracle-cloud-os.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/targets/oracle-cloud-os.mdx b/docs/configuration/targets/oracle-cloud-os.mdx index 8a6954a4..3451052e 100644 --- a/docs/configuration/targets/oracle-cloud-os.mdx +++ b/docs/configuration/targets/oracle-cloud-os.mdx @@ -1,4 +1,4 @@ -# Oracle Cloud Infrastructure Object Storage +# Oracle Cloud Object Storage Oracle CloudLong Term Storage