fix: fix some bugs in llm calling (#217)

WinstonLiyt · web-flow · commit 7b010f8b5940 · 2024-08-21T16:48:09.000+08:00
* fix some bugs in llm calling

* fix a CI error

* fix a small bug in kmeans group number choosing

* fix a ci bug
diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py
@@ -98,6 +98,7 @@ class RDAgentSettings(BaseSettings):
     # factor extraction conf
     max_input_duplicate_factor_group: int = 300
     max_output_duplicate_factor_group: int = 20
+    max_kmeans_group_number: int = 40
 
     # workspace conf
     workspace_path: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_workspace"
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -363,29 +363,49 @@ def check_factor_viability(
 def __check_factor_duplication_simulate_json_mode(
     factor_df: pd.DataFrame,
 ) -> list[list[str]]:
-    session = APIBackend().build_chat_session(
-        session_system_prompt=document_process_prompts["factor_duplicate_system"],
-    )
     current_user_prompt = factor_df.to_string()
 
+    working_list = [factor_df]
+    final_list = []
+
+    while len(working_list) > 0:
+        current_df = working_list.pop(0)
+        if (
+            APIBackend().build_messages_and_calculate_token(
+                user_prompt=current_df.to_string(), system_prompt=document_process_prompts["factor_duplicate_system"]
+            )
+            > RD_AGENT_SETTINGS.chat_token_limit
+        ):
+            working_list.append(current_df.iloc[: current_df.shape[0] // 2, :])
+            working_list.append(current_df.iloc[current_df.shape[0] // 2 :, :])
+        else:
+            final_list.append(current_df)
+
     generated_duplicated_groups = []
-    for _ in range(10):
-        extract_result_resp = session.build_chat_completion(
-            user_prompt=current_user_prompt,
-            json_mode=True,
+    for current_df in final_list:
+        current_factor_to_string = current_df.to_string()
+        session = APIBackend().build_chat_session(
+            session_system_prompt=document_process_prompts["factor_duplicate_system"],
         )
-        ret_dict = json.loads(extract_result_resp)
-        if len(ret_dict) == 0:
-            return generated_duplicated_groups
-        else:
-            generated_duplicated_groups.extend(ret_dict)
-            current_user_prompt = """Continue to extract duplicated groups. If no more duplicated group found please respond empty dict."""
+        for _ in range(10):
+            extract_result_resp = session.build_chat_completion(
+                user_prompt=current_factor_to_string,
+                json_mode=True,
+            )
+            ret_dict = json.loads(extract_result_resp)
+            if len(ret_dict) == 0:
+                return generated_duplicated_groups
+            else:
+                generated_duplicated_groups.extend(ret_dict)
+                current_factor_to_string = """Continue to extract duplicated groups. If no more duplicated group found please respond empty dict."""
     return generated_duplicated_groups
 
 
 def __kmeans_embeddings(embeddings: np.ndarray, k: int = 20) -> list[list[str]]:
     x_normalized = normalize(embeddings)
 
+    np.random.seed(42)
+
     kmeans = KMeans(
         n_clusters=k,
         init="random",
@@ -468,7 +488,7 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li
     else:
         for k in range(
             len(full_str_list) // RD_AGENT_SETTINGS.max_input_duplicate_factor_group,
-            40,
+            RD_AGENT_SETTINGS.max_kmeans_group_number,
         ):
             kmeans_index_group = __kmeans_embeddings(embeddings=embeddings, k=k)
             if len(kmeans_index_group[0]) < RD_AGENT_SETTINGS.max_input_duplicate_factor_group:
@@ -479,26 +499,22 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li
 
     duplication_names_list = []
 
-    pool = mp.Pool(target_k)
-    result_list = [
-        pool.apply_async(
-            __check_factor_duplication_simulate_json_mode,
-            (factor_df.loc[factor_name_group, :],),
-        )
-        for factor_name_group in factor_name_groups
-    ]
+    result_list = multiprocessing_wrapper(
+        [
+            (__check_factor_duplication_simulate_json_mode, (factor_df.loc[factor_name_group, :],))
+            for factor_name_group in factor_name_groups
+        ],
+        n=RD_AGENT_SETTINGS.multi_proc_n,
+    )
 
-    pool.close()
-    pool.join()
+    duplication_names_list = []
 
-    for result in result_list:
-        deduplication_factor_names_list = result.get()
-        for deduplication_factor_names in deduplication_factor_names_list:
-            filter_factor_names = [
-                factor_name for factor_name in set(deduplication_factor_names) if factor_name in factor_dict
-            ]
-            if len(filter_factor_names) > 1:
-                duplication_names_list.append(filter_factor_names)
+    for deduplication_factor_names_list in result_list:
+        filter_factor_names = [
+            factor_name for factor_name in set(deduplication_factor_names_list) if factor_name in factor_dict
+        ]
+        if len(filter_factor_names) > 1:
+            duplication_names_list.append(filter_factor_names)
 
     return duplication_names_list
 
@@ -509,6 +525,8 @@ def deduplicate_factors_by_llm(  # noqa: C901, PLR0912
 ) -> list[list[str]]:
     final_duplication_names_list = []
     current_round_factor_dict = factor_dict
+
+    # handle multi-round deduplication
     for _ in range(10):
         duplication_names_list = __deduplicate_factor_dict(current_round_factor_dict)
 
@@ -524,11 +542,13 @@ def deduplicate_factors_by_llm(  # noqa: C901, PLR0912
         else:
             break
 
+    # sort the final list of duplicates by their length, largest first
     final_duplication_names_list = sorted(final_duplication_names_list, key=lambda x: len(x), reverse=True)
 
-    to_replace_dict = {}
+    to_replace_dict = {}  # to map duplicates to the target factor names
     for duplication_names in duplication_names_list:
         if factor_viability_dict is not None:
+            # check viability of each factor in the duplicates group
             viability_list = [factor_viability_dict[name]["viability"] for name in duplication_names]
             if True not in viability_list:
                 continue
@@ -543,6 +563,7 @@ def deduplicate_factors_by_llm(  # noqa: C901, PLR0912
     llm_deduplicated_factor_dict = {}
     added_lower_name_set = set()
     for factor_name in factor_dict:
+        # only add factors that haven't been replaced and are not duplicates
         if factor_name not in to_replace_dict and factor_name.lower() not in added_lower_name_set:
             if factor_viability_dict is not None and not factor_viability_dict[factor_name]["viability"]:
                 continue