Skip to content

Commit 141039b

Browse files
fix: add hf causal LM python tests, fix build (#2374)
* add hf causal python test * add chat template example * move causallm test * fix typo * use phi4 for hf causal lm test * test db run * test gpu run * test onnx * test run * test run * test run * test run * force reinstall protobuf * test run * test run * test run * test remove onnxmltools from adb * test run * remove install library when submit run * test run * test run * update submit run * add back dependency
1 parent 6c95bf0 commit 141039b

File tree

6 files changed

+135
-16
lines changed

6 files changed

+135
-16
lines changed

core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,7 @@ object DatabricksUtilities {
279279
| "notebook_task": {
280280
| "notebook_path": "$notebookPath",
281281
| "base_parameters": []
282-
| },
283-
| "libraries": $Libraries
282+
| }
284283
|}
285284
""".stripMargin
286285
databricksPost("jobs/runs/submit", body).select[Long]("run_id")
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright (C) Microsoft Corporation. All rights reserved.
2+
# Licensed under the MIT License. See LICENSE in project root for information.
3+
4+
# Tests for the HuggingFaceCausalLM transformer on string and chat-message input.
5+
6+
import unittest
7+
from synapse.ml.llm.HuggingFaceCausallmTransform import HuggingFaceCausalLM
8+
from pyspark.sql import SQLContext
9+
from synapse.ml.core.init_spark import *
10+
from synapse.ml.core import __spark_package_version__
11+
12+
# Spark session created once at import time; the test class below uses it to build its input DataFrames.
spark = init_spark()
13+
# NOTE(review): `sc` is not referenced in the visible part of this file — confirm it is still needed.
sc = SQLContext(spark.sparkContext)
14+
15+
16+
class HuggingFaceCausalLMTester(unittest.TestCase):
    """End-to-end checks for ``HuggingFaceCausalLM`` on string and chat-message input.

    Each test feeds a single-row DataFrame to the transformer and compares the
    text produced in the ``result`` column with the expected label in ``gt``.
    """

    @classmethod
    def setUpClass(cls):
        """Build the transformer and both input DataFrames once for the class.

        unittest creates one TestCase instance per test method while tests are
        being loaded, so building these fixtures in ``__init__`` would repeat
        the work for every test; ``setUpClass`` runs it a single time.
        """
        super().setUpClass()
        cls.transformer = (
            HuggingFaceCausalLM()
            .setModelName("Qwen/Qwen2.5-0.5B-Instruct")
            .setInputCol("messages")
            .setOutputCol("result")
            .setModelParam(max_new_tokens=10)
        )
        # Input given as a plain prompt string.
        cls.strDataFrame = (
            spark.createDataFrame(
                [
                    (
                        "positive",
                        "output a single word (without quotes) of positive or negative in lower case to reflect their sentiment: I like SynapseML",
                    ),
                ]
            )
            .toDF("gt", "messages")
            .repartition(1)
        )
        # Input given as a list of role/content chat messages.
        cls.listDataFrame = (
            spark.createDataFrame(
                [
                    (
                        "positive",
                        [
                            {
                                "role": "system",
                                "content": "Your job is to detect the sentiment of user reviews. Given some text, output a single word (without quotes) of positive or negative to reflect their intent. Output only that single word in lower case: no explanations or complete sentences.",
                            },
                            {"role": "user", "content": "I like SynapseML"},
                        ],
                    ),
                ]
            )
            .toDF("gt", "messages")
            .repartition(1)
        )

    def _assert_output(self, transformer, input_df):
        """Run ``transformer`` on ``input_df`` and check every output row.

        :param transformer: object whose ``transform(df)`` result exposes a
            ``result`` column alongside the input columns.
        :param input_df: DataFrame with ``gt`` and ``messages`` columns.
        :raises AssertionError: if no rows come back, or if any row's
            ``result`` differs from its ``gt``.
        """
        rows = transformer.transform(input_df).collect()
        # Without this guard an empty result would make the loop below pass vacuously.
        self.assertTrue(rows, "transformer returned no rows")
        for row in rows:
            # assertEqual (unlike a bare assert) is not stripped under `python -O`.
            self.assertEqual(
                row.gt,
                row.result,
                f"model prediction {row.result} does not match with ground truth "
                f"{row.gt}, input message is {row.messages}",
            )

    def test_str_df(self):
        """The transformer handles a plain string prompt column."""
        self._assert_output(self.transformer, self.strDataFrame)

    def test_list_df(self):
        """The transformer handles a column of chat-message lists."""
        self._assert_output(self.transformer, self.listDataFrame)
73+
74+
if __name__ == "__main__":
    # unittest.main() runs the tests and then exits the interpreter by default,
    # so binding its return value to a name serves no purpose.
    unittest.main()

docs/Explore Algorithms/Deep Learning/Quickstart - Apply Phi Model with HuggingFace CausalLM.ipynb

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,50 @@
8282
"display(result_df)"
8383
]
8484
},
85+
{
86+
"cell_type": "markdown",
87+
"metadata": {},
88+
"source": [
89+
"## Apply Chat Template"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": null,
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"from pyspark.sql.functions import udf\n",
99+
"from pyspark.sql.types import ArrayType, MapType, StringType\n",
100+
"\n",
101+
"reviews = [\n",
102+
" (1, \"I like SynapseML\"),\n",
103+
" (2, \"Contoso is awful\"),\n",
104+
"]\n",
105+
"reviews_df = spark.createDataFrame(reviews, [\"row_index\", \"content\"])\n",
106+
"\n",
107+
"PROMPT_1 = f\"\"\"You are an AI assistant that identifies the sentiment of a given text. Respond with only the single word “positive” or “negative.”\n",
108+
" \"\"\"\n",
109+
"\n",
110+
"\n",
111+
"# Declare the return type explicitly: a bare @udf defaults to StringType, which would\n",
"# turn the list of message dicts into a single string instead of structured chat messages.\n",
"@udf(returnType=ArrayType(MapType(StringType(), StringType())))\n",
"def make_template(s: str):\n",
"    return [{\"role\": \"system\", \"content\": PROMPT_1}, {\"role\": \"user\", \"content\": s}]\n",
114+
"\n",
115+
"\n",
116+
"reviews_df = reviews_df.withColumn(\"messages\", make_template(\"content\"))\n",
117+
"\n",
118+
"phi3_transformer = (\n",
119+
" HuggingFaceCausalLM()\n",
120+
" .setModelName(\"microsoft/Phi-3-mini-4k-instruct\")\n",
121+
" .setInputCol(\"messages\")\n",
122+
" .setOutputCol(\"result\")\n",
123+
" .setModelParam(max_new_tokens=10)\n",
124+
")\n",
125+
"result_df = phi3_transformer.transform(reviews_df).collect()\n",
126+
"display(result_df)"
127+
]
128+
},
85129
{
86130
"cell_type": "markdown",
87131
"metadata": {},

docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,24 @@
1616
},
1717
{
1818
"cell_type": "markdown",
19-
"source": [
20-
"### Environment Setup on databricks"
21-
],
2219
"metadata": {
2320
"collapsed": false
24-
}
21+
},
22+
"source": [
23+
"### Environment Setup on databricks"
24+
]
2525
},
2626
{
2727
"cell_type": "code",
2828
"execution_count": null,
29+
"metadata": {
30+
"collapsed": false
31+
},
2932
"outputs": [],
3033
"source": [
3134
"# install cloudpickle 2.0.0 to add synapse module for usage of horovod\n",
3235
"%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
33-
],
34-
"metadata": {
35-
"collapsed": false
36-
}
36+
]
3737
},
3838
{
3939
"cell_type": "code",

docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@
7070
{
7171
"cell_type": "code",
7272
"execution_count": null,
73+
"metadata": {
74+
"collapsed": false
75+
},
7376
"outputs": [],
7477
"source": [
7578
"folder_path = \"/tmp/flowers_prepped\"\n",
@@ -81,10 +84,7 @@
8184
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
8285
" zip_ref.extractall(\"/dbfs/tmp\")\n",
8386
" os.remove(zip_path)"
84-
],
85-
"metadata": {
86-
"collapsed": false
87-
}
87+
]
8888
},
8989
{
9090
"cell_type": "code",

docs/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.ipynb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
"This example uses the following Python packages and versions:\n",
1414
"\n",
1515
"- `onnxmltools==1.7.0`\n",
16-
"- `lightgbm==3.2.1`\n"
16+
"- `lightgbm==3.2.1`\n",
17+
"- `onnx==1.17.0`"
1718
]
1819
},
1920
{
@@ -35,7 +36,7 @@
3536
},
3637
"outputs": [],
3738
"source": [
38-
"%pip install lightgbm onnxmltools==1.7.0"
39+
"%pip install --no-cache-dir lightgbm onnxmltools==1.7.0 onnx==1.17.0"
3940
]
4041
},
4142
{

0 commit comments

Comments
 (0)