Skip to content

Commit 1e3d38b

Browse files
authored
fix: change css tag of kaggle competition info crawler (#306)
* Change the CSS class selectors used by the Kaggle competition info crawler
* Fix CI
1 parent f663cf4 commit 1e3d38b

File tree

1 file changed

+54
-4
lines changed

1 file changed

+54
-4
lines changed

rdagent/scenarios/kaggle/kaggle_crawler.py

Lines changed: 54 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False)
4343

4444
# Get main contents
4545
contents = []
46-
elements = site_body.find_elements(By.CSS_SELECTOR, ".sc-iWlrxG.cMAZdc")
46+
elements = site_body.find_elements(By.CSS_SELECTOR, ".fbHzUd")
4747
for e in elements:
4848
content = e.get_attribute("innerHTML")
4949
contents.append(content)
@@ -53,14 +53,14 @@ def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False)
5353
descriptions[subtitles[i]] = contents[i]
5454

5555
# Get the citation
56-
element = site_body.find_element(By.CSS_SELECTOR, ".sc-ifyrTC.sc-fyziuY")
56+
element = site_body.find_element(By.CSS_SELECTOR, ".bZEXEC")
5757
citation = element.get_attribute("innerHTML")
5858
descriptions[subtitles[-1]] = citation
5959

6060
data_url = f"https://www.kaggle.com/competitions/{competition}/data"
6161
driver.get(data_url)
6262
time.sleep(wait)
63-
data_element = driver.find_element(By.CSS_SELECTOR, ".sc-iWlrxG.cMAZdc")
63+
data_element = driver.find_element(By.CSS_SELECTOR, ".fbHzUd")
6464
descriptions["Data Description"] = data_element.get_attribute("innerHTML")
6565

6666
driver.quit()
@@ -80,7 +80,57 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
8080

8181

8282
if __name__ == "__main__":
83-
download_data("feedback-prize-english-language-learning", "/data/userdata/share/kaggle")
83+
dsagent_cs = [
84+
"feedback-prize-english-language-learning",
85+
"playground-series-s3e11",
86+
"playground-series-s3e14",
87+
"spaceship-titanic",
88+
"playground-series-s3e18",
89+
"playground-series-s3e16",
90+
"playground-series-s3e9",
91+
"playground-series-s3e25",
92+
"playground-series-s3e26",
93+
"playground-series-s3e24",
94+
"playground-series-s3e23",
95+
]
96+
97+
other_cs = [
98+
"amp-parkinsons-disease-progression-prediction",
99+
"arc-prize-2024",
100+
"ariel-data-challenge-2024",
101+
"child-mind-institute-detect-sleep-states",
102+
"connectx",
103+
"contradictory-my-dear-watson",
104+
"digit-recognizer",
105+
"fathomnet-out-of-sample-detection",
106+
"forest-cover-type-prediction",
107+
"gan-getting-started",
108+
"google-research-identify-contrails-reduce-global-warming",
109+
"house-prices-advanced-regression-techniques",
110+
"isic-2024-challenge",
111+
"leash-BELKA",
112+
"llm-20-questions",
113+
"nlp-getting-started",
114+
"playground-series-s4e1",
115+
"playground-series-s4e2",
116+
"playground-series-s4e3",
117+
"playground-series-s4e4",
118+
"playground-series-s4e5",
119+
"playground-series-s4e6",
120+
"playground-series-s4e7",
121+
"playground-series-s4e8",
122+
"rsna-2024-lumbar-spine-degenerative-classification",
123+
"sf-crime",
124+
"store-sales-time-series-forecasting",
125+
"titanic",
126+
"tpu-getting-started",
127+
"covid19-global-forecasting-week-1",
128+
"birdsong-recognition",
129+
"optiver-trading-at-the-close",
130+
]
131+
132+
for i in dsagent_cs + other_cs:
133+
crawl_descriptions(i)
84134
exit()
85135
from kaggle.api.kaggle_api_extended import KaggleApi
86136

0 commit comments

Comments
 (0)