
Commit d2e6910

Merge remote-tracking branch 'upstream/master' into alter_add_cols

2 parents: 8dcc44d + df98d5b


325 files changed: +11905 / −3744 lines

.asf.yaml

Lines changed: 4 additions & 0 deletions
@@ -27,3 +27,7 @@ github:
     - jdbc
     - sql
     - spark
+  enabled_merge_buttons:
+    merge: false
+    squash: true
+    rebase: true
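This block drives ASF infra's GitHub repository settings: it disables the plain merge button on pull requests while leaving "Squash and merge" and "Rebase and merge" enabled. For reference, a minimal sketch of where the new keys sit in a standalone .asf.yaml (the labels are taken from the surrounding context; the rest of the real file is elided):

    github:
      labels:
        - jdbc
        - sql
        - spark
      enabled_merge_buttons:
        merge: false    # disable plain merge commits on PRs
        squash: true    # allow "Squash and merge"
        rebase: true    # allow "Rebase and merge"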

.github/workflows/build_and_test.yml

Lines changed: 69 additions & 30 deletions
@@ -5,11 +5,44 @@ on:
     branches:
     - '**'
     - '!branch-*.*'
+  schedule:
+    # master
+    - cron: '0 4 * * *'
+    # branch-3.2
+    - cron: '0 7 * * *'
 
 jobs:
+  configure-jobs:
+    name: Configure jobs
+    runs-on: ubuntu-20.04
+    outputs:
+      branch: ${{ steps.set-outputs.outputs.branch }}
+      type: ${{ steps.set-outputs.outputs.type }}
+      envs: ${{ steps.set-outputs.outputs.envs }}
+    steps:
+    - name: Configure branch and additional environment variables
+      id: set-outputs
+      run: |
+        if [ "${{ github.event.schedule }}" = "0 4 * * *" ]; then
+          echo '::set-output name=branch::master'
+          echo '::set-output name=type::scheduled'
+          echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
+        elif [ "${{ github.event.schedule }}" = "0 7 * * *" ]; then
+          echo '::set-output name=branch::branch-3.2'
+          echo '::set-output name=type::scheduled'
+          echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
+        else
+          echo '::set-output name=branch::master' # Default branch to run on. CHANGE here when a branch is cut out.
+          echo '::set-output name=type::regular'
+          echo '::set-output name=envs::{}'
+        fi
+
   # Build: build Spark and run the tests for specified modules.
   build:
-    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
+    name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
+    needs: configure-jobs
+    # Do not run as scheduled jobs in forked repos
+    if: github.repository == 'apache/spark' || needs.configure-jobs.outputs.type == 'regular'
     # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
     runs-on: ubuntu-20.04
     strategy:
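For readers less familiar with this pattern: the new configure-jobs job writes step outputs with the ::set-output workflow command (since deprecated by GitHub in favor of $GITHUB_OUTPUT, but current at the time of this commit), promotes them to job-level outputs, and every downstream job declares needs: configure-jobs to read them. A minimal self-contained sketch of the mechanism, with illustrative job and output names:

    jobs:
      decide:
        runs-on: ubuntu-20.04
        outputs:
          # Promote the step output to a job output so other jobs can read it.
          branch: ${{ steps.pick.outputs.branch }}
        steps:
        - id: pick
          run: echo '::set-output name=branch::master'
      build:
        needs: decide     # creates the dependency and exposes decide's outputs
        runs-on: ubuntu-20.04
        if: needs.decide.outputs.branch == 'master'
        steps:
        - run: echo "Building ${{ needs.decide.outputs.branch }}"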
@@ -81,16 +114,14 @@ jobs:
       with:
         fetch-depth: 0
         repository: apache/spark
-        ref: master
+        ref: ${{ needs.configure-jobs.outputs.branch }}
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
-        apache_spark_ref=`git rev-parse HEAD`
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
-        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
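The sync step now persists the upstream SHA through $GITHUB_ENV instead of a step output, which is why the id: sync-branch line can be dropped: anything appended as KEY=value to the $GITHUB_ENV file becomes an environment variable in every later step of the same job, with no step id or ${{ steps.*.outputs.* }} lookup required. A minimal sketch of the mechanism (variable name taken from the diff, step names illustrative):

    steps:
    - name: Record the upstream commit
      # Appending KEY=value to $GITHUB_ENV exports it to all later steps.
      run: echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
    - name: Use it later in the same job
      run: echo "Synced against $APACHE_SPARK_REF"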
@@ -130,11 +161,12 @@ jobs:
         python3.8 -m pip list
     # Run the tests.
     - name: Run tests
+      env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
       run: |
-        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
-        # Hive and SQL tests become flaky when running in parallel as it's too intensive.
-        if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
-        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+        # Hive "other tests" test needs larger metaspace size based on experiment.
+        if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
+        export SERIAL_SBT_TESTS=1
+        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
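The added env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }} line is what lets the scheduled runs switch Scala profiles without a separate workflow: fromJSON turns the JSON string emitted by configure-jobs into an object, and assigning an object to a step's env applies each key as an environment variable. Roughly, assuming the scheduled path set envs to {"SCALA_PROFILE": "scala2.13"} as in the hunk above:

    - name: Run tests
      # On a scheduled run this injects SCALA_PROFILE=scala2.13;
      # on a regular run envs is {} and nothing is added.
      env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
      run: echo "Scala profile: ${SCALA_PROFILE:-<unset>}"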
@@ -149,6 +181,8 @@ jobs:
         path: "**/target/unit-tests.log"
 
   pyspark:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-20.04
     container:
@@ -171,6 +205,9 @@ jobs:
       HIVE_PROFILE: hive2.3
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
+      SKIP_UNIDOC: true
+      SKIP_MIMA: true
+      METASPACE_SIZE: 128m
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -181,13 +218,11 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
-        apache_spark_ref=`git rev-parse HEAD`
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
-        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
@@ -217,9 +252,8 @@ jobs:
     # Run the tests.
     - name: Run tests
       run: |
-        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
         export PATH=$PATH:$HOME/miniconda/bin
-        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
+        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
@@ -234,6 +268,8 @@ jobs:
         path: "**/target/unit-tests.log"
 
   sparkr:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: "Build modules: sparkr"
     runs-on: ubuntu-20.04
     container:
@@ -243,6 +279,7 @@ jobs:
       HIVE_PROFILE: hive2.3
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
+      SKIP_MIMA: true
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -253,13 +290,11 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
-        apache_spark_ref=`git rev-parse HEAD`
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
-        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
@@ -285,8 +320,7 @@ jobs:
         # R issues at docker environment
         export TZ=UTC
         export _R_CHECK_SYSTEM_CLOCK_=FALSE
-        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
-        ./dev/run-tests --parallelism 2 --modules sparkr
+        ./dev/run-tests --parallelism 1 --modules sparkr
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
@@ -296,6 +330,8 @@ jobs:
 
   # Static analysis, and documentation build
   lint:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
     env:
@@ -313,7 +349,6 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
@@ -382,7 +417,7 @@ jobs:
     - name: Java linter
       run: ./dev/lint-java
     - name: Python linter
-      run: ./dev/lint-python
+      run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
     - name: R linter
       run: ./dev/lint-r
     - name: JS linter
@@ -397,6 +432,8 @@ jobs:
         bundle exec jekyll build
 
   java-11-17:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Java ${{ matrix.java }} build with Maven
     strategy:
       fail-fast: false
@@ -414,7 +451,6 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
@@ -451,6 +487,8 @@ jobs:
         rm -rf ~/.m2/repository/org/apache/spark
 
   scala-213:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Scala 2.13 build with SBT
     runs-on: ubuntu-20.04
     steps:
@@ -462,7 +500,6 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
@@ -495,6 +532,8 @@ jobs:
         ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
 
   hadoop-2:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Hadoop 2 build with SBT
     runs-on: ubuntu-20.04
     steps:
@@ -506,7 +545,6 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
@@ -538,6 +576,8 @@ jobs:
         ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
 
   tpcds-1g:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Run TPC-DS queries with SF=1
     runs-on: ubuntu-20.04
     env:
@@ -551,7 +591,6 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
@@ -614,6 +653,8 @@ jobs:
         path: "**/target/unit-tests.log"
 
   docker-integration-tests:
+    needs: configure-jobs
+    if: needs.configure-jobs.outputs.type == 'regular'
     name: Run docker integration tests
     runs-on: ubuntu-20.04
     env:
@@ -622,6 +663,7 @@ jobs:
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
       ORACLE_DOCKER_IMAGE_NAME: oracle/database:18.4.0-xe
+      SKIP_MIMA: true
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -631,13 +673,11 @@ jobs:
         ref: master
     - name: Sync the current branch with the latest in Apache Spark
       if: github.repository != 'apache/spark'
-      id: sync-branch
       run: |
-        apache_spark_ref=`git rev-parse HEAD`
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
         git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
-        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     - name: Cache Scala, SBT and Maven
       uses: actions/cache@v2
       with:
@@ -680,8 +720,7 @@ jobs:
         ./buildContainerImage.sh -v 18.4.0 -x
     - name: Run tests
       run: |
-        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
-        ./dev/run-tests --parallelism 2 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
+        ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
