Skip to content

Commit bd8e111

Browse files
committed
add typecodes 'u' and 'c'
1 parent b91dd55 commit bd8e111

File tree

2 files changed

+34
-4
lines changed

2 files changed

+34
-4
lines changed

python/pyspark/sql/tests.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,17 +2270,23 @@ def test_array_types(self):
22702270
# and Scala types.
22712271
# See: https://docs.python.org/2/library/array.html
22722272

2273-
int_types = set(['b', 'h', 'i', 'l'])
2274-
float_types = set(['f', 'd'])
2275-
unsupported_types = set(array.typecodes) - int_types - float_types
2276-
22772273
def collected(a):
22782274
row = Row(myarray=a)
22792275
rdd = self.sc.parallelize([row])
22802276
df = self.spark.createDataFrame(rdd)
22812277
return df.collect()[0]["myarray"][0]
22822278

2279+
# test whether pyspark can correctly handle string types
2280+
string_types = set()
2281+
if sys.version < "4":
2282+
string_types += set(['u'])
2283+
self.assertEqual(collected(array.array('u', ["a"])), "a")
2284+
if sys.version < "3":
2285+
string_types += set(['c'])
2286+
self.assertEqual(collected(array.array('c', ["a"])), "a")
2287+
22832288
# test whether pyspark can correctly handle int types
2289+
int_types = set(['b', 'h', 'i', 'l'])
22842290
for t in int_types:
22852291
# Start from 1 and keep doubling the number until overflow.
22862292
a = array.array(t, [1])
@@ -2300,6 +2306,7 @@ def collected(a):
23002306
break
23012307

23022308
# test whether pyspark can correctly handle float types
2309+
float_types = set(['f', 'd'])
23032310
for t in float_types:
23042311
# test upper bound and precision
23052312
a = array.array(t, [1.0])
@@ -2312,6 +2319,18 @@ def collected(a):
23122319
while a[0] != 0:
23132320
self.assertEqual(collected(a), a[0])
23142321
a[0] /= 2
2322+
2323+
# make sure that the test case cover all supported types
2324+
supported_types = int_types + float_types + string_types
2325+
self.assertEqual(supported_types, _array_type_mappings.keys)
2326+
2327+
all_type_codes = set()
2328+
if sys.version < "3":
2329+
all_type_codes += set([ 'c','b','B','u','h','H','i','I','l','L','f','d' ])
2330+
else:
2331+
all_type_codes += set(array.typecodes)
2332+
unsupported_types = all_type_codes - supported_types
2333+
23152334
# test whether pyspark can correctly handle unsupported types
23162335
for t in unsupported_types:
23172336
try:

python/pyspark/sql/types.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,17 @@ def _parse_datatype_json_value(json_value):
949949
long: LongType,
950950
})
951951

952+
# Type code 'u' in Python's array is deprecated since version 3.3, and will be
953+
# removed in version 4.0. See: https://docs.python.org/3/library/array.html
954+
if sys.version < "4":
955+
_array_type_mappings.update({
956+
'u': StringType
957+
})
958+
959+
if sys.version < "3":
960+
_array_type_mappings.update({
961+
'c': StringType
962+
})
952963

953964
def _infer_type(obj):
954965
"""Infer the DataType from obj

0 commit comments

Comments (0)