Skip to content

Commit 5569788

Browse files
committed
2 parents 0a9af6c + 526ec5b commit 5569788

File tree

9 files changed

+152
-8
lines changed

9 files changed

+152
-8
lines changed

cassandra/src/main/scala/org/apache/zeppelin/cassandra/ParagraphParser.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ object ParagraphParser {
7171

7272
val GENERIC_STATEMENT_PREFIX: Regex =
7373
"""(?is)\s*(?:INSERT|UPDATE|DELETE|SELECT|CREATE|ALTER|
74-
|DROP|GRANT|REVOKE|TRUNCATE|LIST|USE)\s+""".r
74+
|DROP|GRANT|REVOKE|TRUNCATE|LIST|USE|[a-z]\w+)\s+""".r
7575

7676
val VALID_IDENTIFIER = "[a-z][a-z0-9_]*"
7777

docs/_includes/themes/zeppelin/_navigation.html

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
<li class="title"><span>Run Mode</span></li>
3232
<li><a href="{{BASE_PATH}}/quickstart/kubernetes.html">Kubernetes</a></li>
3333
<li><a href="{{BASE_PATH}}/quickstart/docker.html">Docker</a></li>
34+
<li><a href="{{BASE_PATH}}/quickstart/yarn.html">Yarn</a></li>
3435
<li role="separator" class="divider"></li>
3536
<li><a href="{{BASE_PATH}}/quickstart/spark_with_zeppelin.html">Spark with Zeppelin</a></li>
3637
<li><a href="{{BASE_PATH}}/quickstart/sql_with_zeppelin.html">SQL with Zeppelin</a></li>
@@ -85,6 +86,7 @@
8586
<ul class="dropdown-menu scrollable-menu">
8687
<li class="title"><span>Basics</span></li>
8788
<li><a href="{{BASE_PATH}}/setup/basics/how_to_build.html">How to Build Zeppelin</a></li>
89+
<li><a href="{{BASE_PATH}}/setup/basics/hadoop_integration.html">Hadoop Integration</a></li>
8890
<li><a href="{{BASE_PATH}}/setup/basics/multi_user_support.html">Multi-user Support</a></li>
8991
<li role="separator" class="divider"></li>
9092
<li class="title"><span>Deployment</span></li>

docs/quickstart/yarn.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
---
2+
layout: page
3+
title: "Zeppelin on Yarn"
4+
description: "Apache Zeppelin supports running interpreter processes in yarn containers"
5+
group: usage/interpreter
6+
---
7+
<!--
8+
Licensed under the Apache License, Version 2.0 (the "License");
9+
you may not use this file except in compliance with the License.
10+
You may obtain a copy of the License at
11+
12+
http://www.apache.org/licenses/LICENSE-2.0
13+
14+
Unless required by applicable law or agreed to in writing, software
15+
distributed under the License is distributed on an "AS IS" BASIS,
16+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
See the License for the specific language governing permissions and
18+
limitations under the License.
19+
-->
20+
{% include JB/setup %}
21+
22+
# Zeppelin on Yarn
23+
24+
<div id="toc"></div>
25+
26+
Zeppelin on yarn means running the interpreter process in a yarn container. The key benefit is scalability: you won't run out of memory
27+
on the zeppelin server host if you run a large number of interpreter processes.
28+
29+
## Prerequisites
30+
The following is required for yarn interpreter mode.
31+
32+
* Hadoop client (both 2.x and 3.x are supported) is installed.
33+
* `$HADOOP_HOME/bin` is put in `PATH`. Because internally zeppelin will run command `hadoop classpath` to get all the hadoop jars and put them in the classpath of Zeppelin.
34+
* Set `USE_HADOOP` as `true` in `zeppelin-env.sh`.
35+
36+
## Configuration
37+
38+
Yarn interpreter mode needs to be set for each interpreter. You can set `zeppelin.interpreter.launcher` to be `yarn` to run it in yarn mode.
39+
Besides that, you can also specify other properties as following table.
40+
41+
<table class="table-configuration">
42+
<tr>
43+
<th>Name</th>
44+
<th>Default Value</th>
45+
<th>Description</th>
46+
</tr>
47+
<tr>
48+
<td>zeppelin.interpreter.yarn.resource.memory</td>
49+
<td>1024</td>
50+
<td>memory for interpreter process, unit: mb</td>
51+
</tr>
52+
<tr>
53+
<td>zeppelin.interpreter.yarn.resource.memoryOverhead</td>
54+
<td>384</td>
<td>Amount of non-heap memory to be allocated per interpreter process in yarn interpreter mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc.</td>
55+
</tr>
56+
<tr>
57+
<td>zeppelin.interpreter.yarn.resource.cores</td>
58+
<td>1</td>
59+
<td>cpu cores for interpreter process</td>
60+
</tr>
61+
<tr>
62+
<td>zeppelin.interpreter.yarn.queue</td>
63+
<td>default</td>
64+
<td>yarn queue name</td>
65+
</tr>
66+
</table>
67+
68+
## Differences with non-yarn interpreter mode (local mode)
69+
70+
There are several differences between yarn interpreter mode and non-yarn interpreter mode (local mode):
71+
72+
* New yarn app will be allocated for the interpreter process.
73+
* Any local path setting won't work in a yarn interpreter process. E.g. if you run the python interpreter in yarn interpreter mode, then you need to make sure the python executable of `zeppelin.python` exists on all the nodes of the yarn cluster.
74+
Because the python interpreter may launch in any node.
75+
* Don't use it for spark interpreter. Instead use spark's built-in yarn-client or yarn-cluster which is more suitable for spark interpreter.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
---
2+
layout: page
3+
title: "How to integrate with hadoop"
4+
description: "How to integrate with hadoop"
5+
group: setup/basics
6+
---
7+
<!--
8+
Licensed under the Apache License, Version 2.0 (the "License");
9+
you may not use this file except in compliance with the License.
10+
You may obtain a copy of the License at
11+
12+
http://www.apache.org/licenses/LICENSE-2.0
13+
14+
Unless required by applicable law or agreed to in writing, software
15+
distributed under the License is distributed on an "AS IS" BASIS,
16+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
See the License for the specific language governing permissions and
18+
limitations under the License.
19+
-->
20+
{% include JB/setup %}
21+
22+
# Integrate with hadoop
23+
24+
<div id="toc"></div>
25+
26+
Hadoop is an optional component of zeppelin unless you need the following features:
27+
28+
* Use hdfs to store notes.
29+
* Use hdfs to store interpreter configuration
30+
* Use hdfs to store recovery data
31+
* Launch interpreter in yarn mode
32+
33+
## Requirements
34+
35+
Zeppelin 0.9 doesn't ship with hadoop dependencies; you need to include the hadoop jars yourself via the following steps:
36+
37+
* Hadoop client (both 2.x and 3.x are supported) is installed.
38+
* `$HADOOP_HOME/bin` is put in `PATH`. Because internally zeppelin will run command `hadoop classpath` to get all the hadoop jars and put them in the classpath of Zeppelin.
39+
* Set `USE_HADOOP` as `true` in `zeppelin-env.sh`.

spark/interpreter/src/main/resources/python/zeppelin_ipyspark.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,8 @@ def show(self, obj, **kwargs):
7373
super(IPySparkZeppelinContext, self).show(obj, **kwargs)
7474

7575
z = __zeppelin__ = IPySparkZeppelinContext(intp.getZeppelinContext(), gateway)
76+
77+
# add jars to path
78+
import sys
79+
jarlist = map(lambda url: url.replace("file:/", "/"), (conf.get("spark.jars") or "").split(","))
80+
sys.path.extend(filter(lambda jar: jar not in sys.path, jarlist))

spark/interpreter/src/main/resources/python/zeppelin_pyspark.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,8 @@ def show(self, obj, **kwargs):
7171

7272
z = __zeppelin__ = PySparkZeppelinContext(intp.getZeppelinContext(), gateway)
7373
__zeppelin__._setup_matplotlib()
74+
75+
# add jars to path
76+
import sys
77+
jarlist = map(lambda url: url.replace("file:/", "/"), (conf.get("spark.jars") or "").split(","))
78+
sys.path.extend(filter(lambda jar: jar not in sys.path, jarlist))

zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/ZeppelinSparkClusterTest.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,20 @@ public void testConfInterpreter() throws IOException {
10381038
p1.setText("%spark\nimport com.databricks.spark.csv._");
10391039
note.run(p1.getId(), true);
10401040
assertEquals(Status.FINISHED, p1.getStatus());
1041+
1042+
// test pyspark imports path
1043+
Paragraph p2 = note.addNewParagraph(anonymous);
1044+
p2.setText("%spark.pyspark\nimport sys\nsys.path");
1045+
note.run(p2.getId(), true);
1046+
assertEquals(Status.FINISHED, p2.getStatus());
1047+
assertTrue(p2.getReturn().toString().contains("databricks_spark"));
1048+
1049+
Paragraph p3 = note.addNewParagraph(anonymous);
1050+
p3.setText("%spark.ipyspark\nimport sys\nsys.path");
1051+
note.run(p3.getId(), true);
1052+
assertEquals(Status.FINISHED, p3.getStatus());
1053+
assertTrue(p3.getReturn().toString().contains("databricks_spark"));
1054+
10411055
} finally {
10421056
if (null != note) {
10431057
TestUtils.getInstance(Notebook.class).removeNote(note, anonymous);

zeppelin-server/src/main/java/org/apache/zeppelin/service/JobManagerService.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ public List<NoteJobInfo> getNoteJobInfoByUnixTime(long lastUpdateServerUnixTime,
8181
if (!conf.isJobManagerEnabled()) {
8282
return new ArrayList<>();
8383
}
84-
List<NoteJobInfo> notesJobInfo = new ArrayList<>();
85-
notebook.getNoteStream()
84+
85+
List<NoteJobInfo> notesJobInfo = notebook.getNoteStream()
8686
.filter(note -> authorizationService.isOwner(context.getUserAndRoles(), note.getId()))
8787
.map(note -> new NoteJobInfo(note))
8888
.filter(noteJobInfo -> noteJobInfo.unixTimeLastRun > lastUpdateServerUnixTime)

zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -506,8 +506,10 @@ public void onFailure(Exception ex, ServiceContext context) throws IOException {
506506
});
507507
}
508508

509-
public void broadcastUpdateNoteJobInfo(long lastUpdateUnixTime) throws IOException {
510-
getJobManagerService().getNoteJobInfoByUnixTime(lastUpdateUnixTime, null,
509+
public void broadcastUpdateNoteJobInfo(Note note, long lastUpdateUnixTime) throws IOException {
510+
ServiceContext context = new ServiceContext(new AuthenticationInfo(),
511+
getNotebookAuthorizationService().getOwners(note.getId()));
512+
getJobManagerService().getNoteJobInfoByUnixTime(lastUpdateUnixTime, context,
511513
new WebSocketServiceCallback<List<JobManagerService.NoteJobInfo>>(null) {
512514
@Override
513515
public void onSuccess(List<JobManagerService.NoteJobInfo> notesJobInfo,
@@ -1799,7 +1801,9 @@ public void run() {
17991801
@Override
18001802
public void onParagraphRemove(Paragraph p) {
18011803
try {
1802-
getJobManagerService().getNoteJobInfoByUnixTime(System.currentTimeMillis() - 5000, null,
1804+
ServiceContext context = new ServiceContext(new AuthenticationInfo(),
1805+
getNotebookAuthorizationService().getOwners(p.getNote().getId()));
1806+
getJobManagerService().getNoteJobInfoByUnixTime(System.currentTimeMillis() - 5000, context,
18031807
new JobManagerServiceCallback());
18041808
} catch (IOException e) {
18051809
LOG.warn("can not broadcast for job manager: " + e.getMessage(), e);
@@ -1809,7 +1813,7 @@ public void onParagraphRemove(Paragraph p) {
18091813
@Override
18101814
public void onNoteRemove(Note note, AuthenticationInfo subject) {
18111815
try {
1812-
broadcastUpdateNoteJobInfo(System.currentTimeMillis() - 5000);
1816+
broadcastUpdateNoteJobInfo(note, System.currentTimeMillis() - 5000);
18131817
} catch (IOException e) {
18141818
LOG.warn("can not broadcast for job manager: " + e.getMessage(), e);
18151819
}
@@ -1918,7 +1922,7 @@ public void onStatusChange(Paragraph p, Status before, Status after) {
19181922
p.setStatusToUserParagraph(p.getStatus());
19191923
broadcastParagraph(p.getNote(), p);
19201924
try {
1921-
broadcastUpdateNoteJobInfo(System.currentTimeMillis() - 5000);
1925+
broadcastUpdateNoteJobInfo(p.getNote(), System.currentTimeMillis() - 5000);
19221926
} catch (IOException e) {
19231927
LOG.error("can not broadcast for job manager {}", e);
19241928
}

0 commit comments

Comments
 (0)