
Add comprehensive Apache Spark test workflow with 6 tests including P… #1
Workflow file for this run

name: Test Spark on Arm64

on:
  workflow_call:
  workflow_dispatch:
  push:
    branches:
      - main
      - smoke_tests
    paths:
      - 'content/opensource_packages/spark.md'
      - '.github/workflows/test-spark.yml'

jobs:
  test-spark:
    runs-on: ubuntu-24.04-arm
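    # "ubuntu-24.04-arm" is GitHub's Arm64-hosted runner label, so every step
    # below runs natively on arm64.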
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set test metadata
        id: metadata
        run: |
          echo "timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_OUTPUT
          echo "package_slug=spark" >> $GITHUB_OUTPUT
          echo "dashboard_link=/opensource_packages/spark" >> $GITHUB_OUTPUT

      # ============================================================
      # Install Apache Spark and dependencies
      # ============================================================
      - name: Install Java (Spark dependency)
        id: install_java
        run: |
          echo "Installing Java 17..."
          sudo apt-get update
          sudo apt-get install -y openjdk-17-jdk
          java -version
          # Derive JAVA_HOME by resolving the java binary's symlink chain
          echo "JAVA_HOME=$(dirname $(dirname $(readlink -f $(which java))))" >> $GITHUB_ENV
      - name: Install Apache Spark
        id: install
        run: |
          echo "Installing Apache Spark..."
          # Download Spark 3.5.3 (prebuilt for Hadoop 3) from the Apache archive
          SPARK_VERSION="3.5.3"
          HADOOP_VERSION="3"
          cd /tmp
          wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
          # Extract to /opt and symlink a version-independent path
          sudo tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /opt/
          sudo ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark
          # Set environment variables
          echo "SPARK_HOME=/opt/spark" >> $GITHUB_ENV
          echo "/opt/spark/bin" >> $GITHUB_PATH
          # Verify installation
          if [ -d "/opt/spark" ]; then
            echo "Spark installed successfully"
            echo "install_status=success" >> $GITHUB_OUTPUT
          else
            echo "Spark installation failed"
            echo "install_status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
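          # Optional hardening (sketch, not run here): the Apache archive also
          # publishes a checksum alongside each tarball, which could be
          # verified before extraction, e.g.:
          #   wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512
          #   sha512sum -c spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512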

      # ============================================================
      # Detect version
      # ============================================================
      - name: Detect Spark version
        id: version
        run: |
          # Read the version from the RELEASE file shipped with the distribution
          # (simpler than parsing the banner that spark-submit --version prints)
          if [ -f /opt/spark/RELEASE ]; then
            VERSION=$(grep "Spark" /opt/spark/RELEASE | head -n 1 | sed 's/.*Spark \([0-9.]*\).*/\1/')
          else
            # Fallback: parse the version out of the resolved directory name
            VERSION=$(basename $(readlink -f /opt/spark) | grep -oP '(?<=spark-)[0-9.]+')
          fi
          echo "version=$VERSION" >> $GITHUB_OUTPUT
          echo "Detected Spark version: $VERSION"

      # ============================================================
      # Run tests
      # ============================================================
      - name: Test 1 - Check spark-submit exists
        id: test1
        run: |
          START_TIME=$(date +%s)
          if command -v spark-submit &> /dev/null; then
            echo "✓ spark-submit command found"
            which spark-submit
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ spark-submit command not found"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      - name: Test 2 - Check spark-shell exists
        id: test2
        run: |
          START_TIME=$(date +%s)
          if command -v spark-shell &> /dev/null; then
            echo "✓ spark-shell command found"
            which spark-shell
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ spark-shell command not found"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      - name: Test 3 - Check pyspark exists
        id: test3
        run: |
          START_TIME=$(date +%s)
          if command -v pyspark &> /dev/null; then
            echo "✓ pyspark command found"
            which pyspark
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ pyspark command not found"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      - name: Test 4 - Verify Java dependency
        id: test4
        run: |
          START_TIME=$(date +%s)
          if java -version 2>&1 | grep -q "openjdk"; then
            echo "✓ Java is installed and accessible"
            java -version
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ Java check failed"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      - name: Test 5 - Run simple Spark application
        id: test5
        run: |
          START_TIME=$(date +%s)
          # Write a minimal Spark application (Scala) for reference; it is not
          # compiled here, since the check below uses the bundled SparkPi example
          cat > /tmp/SparkTest.scala << 'EOF'
          import org.apache.spark.sql.SparkSession

          object SparkTest {
            def main(args: Array[String]): Unit = {
              val spark = SparkSession.builder()
                .appName("ARM64 Test")
                .master("local[1]")
                .getOrCreate()

              val data = Seq(("ARM64", 1), ("Spark", 2), ("Test", 3))
              val df = spark.createDataFrame(data).toDF("name", "value")
              println("DataFrame created successfully:")
              df.show()
              spark.stop()
            }
          }
          EOF
          # Run the SparkPi example that ships with the distribution: it is
          # simple and needs no external dependencies or compilation step
          if /opt/spark/bin/run-example SparkPi 10 2>&1 | grep -q "Pi is roughly"; then
            echo "✓ Spark application executed successfully"
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ Spark application failed to execute"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      - name: Test 6 - Run PySpark test
        id: test6
        run: |
          START_TIME=$(date +%s)
          # Install Python if not already present
          sudo apt-get install -y python3 python3-pip
          # Create a simple PySpark script
          cat > /tmp/pyspark_test.py << 'EOF'
          from pyspark.sql import SparkSession

          spark = SparkSession.builder \
              .appName("ARM64 PySpark Test") \
              .master("local[1]") \
              .getOrCreate()

          data = [("ARM64", 1), ("PySpark", 2), ("Test", 3)]
          df = spark.createDataFrame(data, ["name", "value"])
          print("PySpark DataFrame created successfully:")
          df.show()

          # Verify we can perform operations
          count = df.count()
          print(f"Row count: {count}")
          spark.stop()

          # Exit with success if we got here
          if count == 3:
              exit(0)
          else:
              exit(1)
          EOF
          # Run the PySpark script
          if /opt/spark/bin/spark-submit /tmp/pyspark_test.py 2>&1 | grep -q "PySpark DataFrame created successfully"; then
            echo "✓ PySpark application executed successfully"
            echo "status=passed" >> $GITHUB_OUTPUT
          else
            echo "✗ PySpark application failed to execute"
            echo "status=failed" >> $GITHUB_OUTPUT
            exit 1
          fi
          END_TIME=$(date +%s)
          echo "duration=$((END_TIME - START_TIME))" >> $GITHUB_OUTPUT

      # ============================================================
      # Calculate summary
      # ============================================================
      - name: Calculate test summary
        if: always()
        id: summary
        run: |
          PASSED=0
          FAILED=0
          TOTAL_DURATION=0

          # Test 1
          if [ "${{ steps.test1.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test1.outputs.duration || 0 }}))

          # Test 2
          if [ "${{ steps.test2.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test2.outputs.duration || 0 }}))

          # Test 3
          if [ "${{ steps.test3.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test3.outputs.duration || 0 }}))

          # Test 4
          if [ "${{ steps.test4.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test4.outputs.duration || 0 }}))

          # Test 5
          if [ "${{ steps.test5.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test5.outputs.duration || 0 }}))

          # Test 6
          if [ "${{ steps.test6.outputs.status }}" == "passed" ]; then
            PASSED=$((PASSED + 1))
          else
            FAILED=$((FAILED + 1))
          fi
          TOTAL_DURATION=$((TOTAL_DURATION + ${{ steps.test6.outputs.duration || 0 }}))

          echo "passed=$PASSED" >> $GITHUB_OUTPUT
          echo "failed=$FAILED" >> $GITHUB_OUTPUT
          echo "duration=$TOTAL_DURATION" >> $GITHUB_OUTPUT
          if [ $FAILED -eq 0 ]; then
            echo "overall_status=success" >> $GITHUB_OUTPUT
            echo "badge_status=passing" >> $GITHUB_OUTPUT
          else
            echo "overall_status=failure" >> $GITHUB_OUTPUT
            echo "badge_status=failing" >> $GITHUB_OUTPUT
          fi
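          # The "|| 0" fallbacks above keep the arithmetic valid when a test
          # step was skipped and produced no duration output.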

      # ============================================================
      # Generate JSON with Spark metadata
      # ============================================================
      - name: Generate test results JSON
        if: always()
        run: |
          mkdir -p test-results
          cat > test-results/spark.json << EOF
          {
            "schema_version": "1.0",
            "package": {
              "name": "Apache Spark",
              "version": "${{ steps.version.outputs.version }}",
              "language": "scala",
              "category": "Databases - Big-data"
            },
            "run": {
              "id": "${{ github.run_id }}",
              "url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}",
              "timestamp": "${{ steps.metadata.outputs.timestamp }}",
              "status": "${{ steps.summary.outputs.overall_status }}",
              "runner": {
                "os": "ubuntu-24.04",
                "arch": "arm64"
              }
            },
            "tests": {
              "passed": ${{ steps.summary.outputs.passed }},
              "failed": ${{ steps.summary.outputs.failed }},
              "skipped": 0,
              "duration_seconds": ${{ steps.summary.outputs.duration }},
              "details": [
                {
                  "name": "Check spark-submit exists",
                  "status": "${{ steps.test1.outputs.status }}",
                  "duration_seconds": ${{ steps.test1.outputs.duration || 0 }}
                },
                {
                  "name": "Check spark-shell exists",
                  "status": "${{ steps.test2.outputs.status }}",
                  "duration_seconds": ${{ steps.test2.outputs.duration || 0 }}
                },
                {
                  "name": "Check pyspark exists",
                  "status": "${{ steps.test3.outputs.status }}",
                  "duration_seconds": ${{ steps.test3.outputs.duration || 0 }}
                },
                {
                  "name": "Verify Java dependency",
                  "status": "${{ steps.test4.outputs.status }}",
                  "duration_seconds": ${{ steps.test4.outputs.duration || 0 }}
                },
                {
                  "name": "Run simple Spark application",
                  "status": "${{ steps.test5.outputs.status }}",
                  "duration_seconds": ${{ steps.test5.outputs.duration || 0 }}
                },
                {
                  "name": "Run PySpark test",
                  "status": "${{ steps.test6.outputs.status }}",
                  "duration_seconds": ${{ steps.test6.outputs.duration || 0 }}
                }
              ]
            },
            "metadata": {
              "dashboard_link": "${{ steps.metadata.outputs.dashboard_link }}",
              "badge_status": "${{ steps.summary.outputs.badge_status }}"
            }
          }
          EOF
          echo "Generated test results:"
          cat test-results/spark.json
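          # Note: the unquoted numeric fields rely on the summary step's
          # outputs; if that step produced no output, the empty expansions
          # would leave the JSON invalid.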

      # ============================================================
      # STANDARD STEPS - Commit results to repository
      # ============================================================
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: spark-test-results
          path: test-results/spark.json
          retention-days: 90

      - name: Commit test results to repository
        if: always() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/smoke_tests')
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          mkdir -p data/test-results
          cp test-results/spark.json data/test-results/spark.json
          git add data/test-results/spark.json
          if ! git diff --staged --quiet; then
            git commit -m "Update spark test results [skip ci]"
            # Retry loop: if another run pushed first, rebase onto the remote
            # branch and push again
            MAX_RETRIES=5
            RETRY_COUNT=0
            while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
              if git push origin ${{ github.ref_name }}; then
                echo "Successfully pushed test results"
                break
              else
                RETRY_COUNT=$((RETRY_COUNT + 1))
                if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
                  echo "Push failed, attempt $RETRY_COUNT of $MAX_RETRIES. Retrying in 5 seconds..."
                  sleep 5
                  git pull --rebase origin ${{ github.ref_name }}
                else
                  echo "Failed to push after $MAX_RETRIES attempts"
                  exit 1
                fi
              fi
            done
          else
            echo "No changes to commit"
          fi
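          # Note: pushing with the default GITHUB_TOKEN assumes this workflow
          # has "contents: write" permission (the default unless restricted in
          # repository settings or a top-level permissions block).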