Commit 0111cff

[BEAM-12906] Add a dataframe extra for installing a pandas version supported by the DataFrame API (#15528)
* Add 'dataframe' extra
* Update documentation to reference 'dataframe' extra
* Add dataframe to default extras
* fixup! Add 'dataframe' extra
* Install dataframe extra in installGcpTest task (for integration tests)
1 parent 5e7e66f commit 0111cff

6 files changed: 26 additions, 18 deletions

buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy (1 addition, 1 deletion)

```diff
@@ -2337,7 +2337,7 @@ class BeamModulePlugin implements Plugin<Project> {
       def distTarBall = "${pythonRootDir}/build/apache-beam.tar.gz"
       project.exec {
         executable 'sh'
-        args '-c', ". ${project.ext.envdir}/bin/activate && pip install --retries 10 ${distTarBall}[gcp,test,aws,azure]"
+        args '-c', ". ${project.ext.envdir}/bin/activate && pip install --retries 10 ${distTarBall}[gcp,test,aws,azure,dataframe]"
       }
     }
   }
```

examples/notebooks/tour-of-beam/dataframes.ipynb (2 additions, 2 deletions)

```diff
@@ -65,7 +65,7 @@
     "[Beam DataFrames overview](https://beam.apache.org/documentation/dsls/dataframes/overview) page.\n",
     "\n",
     "First, we need to install Apache Beam with the `interactive` extra for the Interactive runner.",
-    "We also need `pandas` for this notebook, but the Interactive runner already depends on it."
+    "We also need to install a version of `pandas` supported by the DataFrame API, which we can get with the `dataframe` extra in Beam 2.34.0 and newer."
   ],
   "metadata": {
     "id": "hDuXLLSZnI1D"
@@ -75,7 +75,7 @@
   "cell_type": "code",
   "execution_count": null,
   "source": [
-    "%pip install --quiet apache-beam[interactive]"
+    "%pip install --quiet apache-beam[interactive,dataframe]"
   ],
   "outputs": [],
   "metadata": {
```

sdks/python/apache_beam/examples/dataframe/README.md (4 additions, 6 deletions)

```diff
@@ -26,12 +26,10 @@ API](https://beam.apache.org/documentation/dsls/dataframes/overview/).
 
 You must have `apache-beam>=2.30.0` installed in order to run these pipelines,
 because the `apache_beam.examples.dataframe` module was added in that release.
-Additionally using the DataFrame API requires `pandas>=1.0.0` to be installed
-in your local Python session. The _same_ version should be installed on workers
-when executing DataFrame API pipelines on distributed runners. Reference
-[`base_image_requirements.txt`](../../../container/base_image_requirements.txt)
-for the Beam release you are using to see what version of pandas will be used
-by default on distributed workers.
+Using the DataFrame API also requires a compatible pandas version to be
+installed, see the
+[documentation](https://beam.apache.org/documentation/dsls/dataframes/overview/#pre-requisites)
+for details.
 
 ## Wordcount Pipeline
```
sdks/python/setup.py (3 additions, 2 deletions)

```diff
@@ -165,7 +165,7 @@ def get_version():
 REQUIRED_TEST_PACKAGES = [
     'freezegun>=0.3.12',
     'mock>=1.0.1,<3.0.0',
-    'pandas>=1.0,<1.4.0',
+    'pandas<2.0.0',
     'parameterized>=0.7.1,<0.8.0',
     'pyhamcrest>=1.9,!=1.10.0,<2.0.0',
     'pyyaml>=3.12,<6.0.0',
@@ -305,7 +305,8 @@ def run(self):
         'interactive': INTERACTIVE_BEAM,
         'interactive_test': INTERACTIVE_BEAM_TEST,
         'aws': AWS_REQUIREMENTS,
-        'azure': AZURE_REQUIREMENTS
+        'azure': AZURE_REQUIREMENTS,
+        'dataframe': ['pandas>=1.0,<1.4']
     },
     zip_safe=False,
     # PyPI package information.
```
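The `extras_require` entry above is what lets `pip install apache-beam[dataframe]` pull in a compatible pandas. As an illustrative sketch of what a pin like `pandas>=1.0,<1.4` means (simplified, hypothetical code, not Beam or pip internals; real tools parse versions per PEP 440):

```python
def satisfies(version, lower, upper):
    """Check lower <= version < upper by numeric tuple comparison.

    A toy model of the half-open range in the 'dataframe' extra's
    'pandas>=1.0,<1.4' pin; real installers handle pre-releases,
    epochs, and local versions per PEP 440.
    """
    def parse(v):
        return tuple(int(part) for part in v.split('.'))
    return parse(lower) <= parse(version) < parse(upper)

print(satisfies('1.3.5', '1.0', '1.4'))   # True: inside the pin
print(satisfies('1.4.0', '1.0', '1.4'))   # False: upper bound is exclusive
print(satisfies('0.25.3', '1.0', '1.4'))  # False: below the lower bound
```

The exclusive upper bound is why the extra tracks the newest pandas the DataFrame API has been validated against, rather than whatever latest release pip would otherwise choose.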

sdks/python/tox.ini (5 additions, 5 deletions)

```diff
@@ -30,7 +30,7 @@ select = E3
 # allow apps that support color to use it.
 passenv=TERM
 # Set [] options for pip installation of apache-beam tarball.
-extras = test
+extras = test,dataframe
 # Don't warn that these commands aren't installed.
 whitelist_externals =
   false
@@ -88,7 +88,7 @@ commands =
   {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
 
 [testenv:py{36,37,38}-cloud]
-extras = test,gcp,interactive,aws,azure
+extras = test,gcp,interactive,dataframe,aws,azure
 commands =
   {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
 
@@ -98,7 +98,7 @@ deps =
   codecov
   pytest-cov==2.9.0
 passenv = GIT_* BUILD_* ghprb* CHANGE_ID BRANCH_NAME JENKINS_* CODECOV_*
-extras = test,gcp,interactive,aws
+extras = test,gcp,interactive,dataframe,aws
 commands =
   -rm .coverage
   {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" "--cov-report=xml --cov=. --cov-append"
@@ -138,7 +138,7 @@ commands =
   python setup.py mypy
 
 [testenv:py38-docs]
-extras = test,gcp,docs,interactive
+extras = test,gcp,docs,interactive,dataframe
 deps =
   Sphinx==1.8.5
   sphinx_rtd_theme==0.4.3
@@ -197,7 +197,7 @@ commands =
 # pulls in the latest docutils. Uncomment this line once botocore does not
 # conflict with Sphinx:
 # extras = docs,test,gcp,aws,interactive,interactive_test
-extras = test,gcp,aws,interactive,interactive_test
+extras = test,gcp,aws,dataframe,interactive,interactive_test
 passenv = WORKSPACE
 commands =
   time {toxinidir}/scripts/run_dependency_check.sh
```
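For context, the `extras` key in a tox testenv maps directly to pip extras when tox installs the package under test, which is why adding `dataframe` to each environment above pulls in the pinned pandas. A minimal, hypothetical testenv showing the mechanism (the environment name and command are illustrative, not from this commit):

```ini
[testenv:py38-dataframe]
# tox installs the sdist under test as: pip install apache-beam[test,dataframe]
extras = test,dataframe
commands = pytest apache_beam/dataframe
```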

website/www/site/content/en/documentation/dsls/dataframes/overview.md (11 additions, 2 deletions)

```diff
@@ -30,9 +30,18 @@ The Beam DataFrame API is intended to provide access to a familiar programming i
 
 If you’re new to pandas DataFrames, you can get started by reading [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html), which shows you how to import and work with the `pandas` package. pandas is an open-source Python library for data manipulation and analysis. It provides data structures that simplify working with relational or labeled data. One of these data structures is the [DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html), which contains two-dimensional tabular data and provides labeled rows and columns for the data.
 
-## Using DataFrames
+## Pre-requisites
+
+To use Beam DataFrames, you need to install Beam python version 2.26.0 or higher (for complete setup instructions, see the [Apache Beam Python SDK Quickstart](https://beam.apache.org/get-started/quickstart-py/)) and a supported `pandas` version. In Beam 2.34.0 and newer the easiest way to do this is with the "dataframe" extra:
+
+```
+pip install apache_beam[dataframe]
+```
 
-To use Beam DataFrames, you need to install Apache Beam version 2.26.0 or higher (for complete setup instructions, see the [Apache Beam Python SDK Quickstart](https://beam.apache.org/get-started/quickstart-py/)) and pandas version 1.0 or higher. You can use DataFrames as shown in the following example, which reads New York City taxi data from a CSV file, performs a grouped aggregation, and writes the output back to CSV:
+Note that the _same_ `pandas` version should be installed on workers when executing DataFrame API pipelines on distributed runners. Reference [`base_image_requirements.txt`](https://github.com/apache/beam/blob/master/sdks/python/container/base_image_requirements.txt) for the Beam release you are using to see what version of `pandas` will be used by default on workers.
+
+## Using DataFrames
+You can use DataFrames as shown in the following example, which reads New York City taxi data from a CSV file, performs a grouped aggregation, and writes the output back to CSV:
 
 {{< highlight py >}}
 from apache_beam.dataframe.io import read_csv
```
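The taxi example that the updated docs reference continues past this hunk; only its first import is shown. As a hedged sketch of the grouped aggregation it describes, here is equivalent eager pandas logic (the data and column names are hypothetical; the Beam DataFrame API mirrors such expressions but defers execution to a Beam pipeline instead of computing immediately):

```python
import pandas as pd

# Hypothetical in-memory rides; the real example reads NYC taxi data from CSV.
rides = pd.DataFrame({
    'passenger_count': [1, 2, 1, 2],
    'fare_amount': [5.0, 7.5, 6.0, 8.5],
})

# Grouped aggregation: total fares per passenger count. The same
# groupby/sum expression works on a Beam deferred DataFrame.
totals = rides.groupby('passenger_count').sum()
print(totals.loc[1, 'fare_amount'])  # 11.0
print(totals.loc[2, 'fare_amount'])  # 16.0
```

This is also why the pandas version pin matters: the deferred implementation delegates to the installed pandas, so workers and the launching session should agree on the version.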
