{"id":1131910,"date":"2025-01-08T20:52:31","date_gmt":"2025-01-08T12:52:31","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1131910.html"},"modified":"2025-01-08T20:52:34","modified_gmt":"2025-01-08T12:52:34","slug":"python%e5%a6%82%e4%bd%95%e4%b8%80%e6%ac%a1%e8%bf%90%e8%a1%8c%e5%a5%bd%e5%a4%9a%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1131910.html","title":{"rendered":"python\u5982\u4f55\u4e00\u6b21\u8fd0\u884c\u597d\u591a\u6570\u636e"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/25101537\/4aa97c3c-a660-4223-8904-fb196fb41ea3.webp\" alt=\"python\u5982\u4f55\u4e00\u6b21\u8fd0\u884c\u597d\u591a\u6570\u636e\" \/><\/p>\n<p><p> <strong>Python\u5982\u4f55\u4e00\u6b21\u8fd0\u884c\u597d\u591a\u6570\u636e<\/strong><\/p>\n<\/p>\n<p><p><strong>Python\u4e00\u6b21\u8fd0\u884c\u5927\u91cf\u6570\u636e\u7684\u65b9\u6cd5\u4e3b\u8981\u5305\u62ec\uff1a\u6279\u91cf\u5904\u7406\u3001\u5e76\u884c\u8ba1\u7b97\u3001\u591a\u7ebf\u7a0b\u548c\u591a\u8fdb\u7a0b\u3001\u4f7f\u7528\u9002\u5408\u5927\u6570\u636e\u5904\u7406\u7684\u5e93\u5982Pandas\u548cNumPy\u3002<\/strong> \u5176\u4e2d\uff0c<strong>\u6279\u91cf\u5904\u7406<\/strong> 
\u662f\u5904\u7406\u5927\u91cf\u6570\u636e\u65f6\u6700\u5e38\u7528\u7684\u65b9\u6cd5\u4e4b\u4e00\u3002\u901a\u8fc7\u5c06\u6570\u636e\u5212\u5206\u4e3a\u591a\u4e2a\u5c0f\u6279\u6b21\uff0c\u9010\u4e2a\u5904\u7406\u8fd9\u4e9b\u5c0f\u6279\u6b21\uff0c\u53ef\u4ee5\u63d0\u9ad8\u7a0b\u5e8f\u7684\u6548\u7387\u548c\u7a33\u5b9a\u6027\u3002\u4e0b\u9762\u5c06\u8be6\u7ec6\u63cf\u8ff0\u5982\u4f55\u4f7f\u7528\u6279\u91cf\u5904\u7406\u6280\u672f\u3002<\/p>\n<\/p>\n<p><p>\u6279\u91cf\u5904\u7406\u662f\u4e00\u79cd\u5e38\u7528\u7684\u6570\u636e\u5904\u7406\u65b9\u6cd5\uff0c\u5c24\u5176\u9002\u7528\u4e8e\u6570\u636e\u91cf\u8f83\u5927\u7684\u573a\u666f\u3002\u901a\u8fc7\u5c06\u5927\u6570\u636e\u96c6\u5206\u5272\u6210\u8f83\u5c0f\u7684\u6279\u6b21\uff0c\u9010\u4e2a\u5904\u7406\u8fd9\u4e9b\u6279\u6b21\uff0c\u53ef\u4ee5\u907f\u514d\u5185\u5b58\u6ea2\u51fa\u95ee\u9898\uff0c\u5e76\u4e14\u4f7f\u7a0b\u5e8f\u66f4\u6613\u4e8e\u7ba1\u7406\u548c\u8c03\u8bd5\u3002\u4f8b\u5982\uff0c\u5728\u5904\u7406\u767e\u4e07\u7ea7\u522b\u7684\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u5c06\u6570\u636e\u5206\u5272\u6210\u6bcf\u6279\u5904\u7406\u4e00\u4e07\u6761\u8bb0\u5f55\uff0c\u9010\u6279\u5904\u7406\u5e76\u5408\u5e76\u7ed3\u679c\u3002\u5177\u4f53\u5b9e\u73b0\u53ef\u4ee5\u5229\u7528Python\u4e2d\u7684\u751f\u6210\u5668\u548c\u8fed\u4ee3\u5668\uff0c\u9010\u4e2a\u8bfb\u53d6\u6570\u636e\u5e76\u8fdb\u884c\u5904\u7406\uff0c\u4ece\u800c\u63d0\u9ad8\u5904\u7406\u6548\u7387\u3002<\/p>\n<\/p>\n<p><h3>\u4e00\u3001\u6279\u91cf\u5904\u7406<\/h3>\n<\/p>\n<p><p>\u6279\u91cf\u5904\u7406\u662f\u6307\u5c06\u5927\u6570\u636e\u96c6\u5206\u6210\u82e5\u5e72\u5c0f\u6279\u6b21\uff0c\u9010\u4e2a\u5904\u7406\u8fd9\u4e9b\u5c0f\u6279\u6b21\u3002\u8fd9\u6837\u53ef\u4ee5\u907f\u514d\u4e00\u6b21\u6027\u52a0\u8f7d\u6240\u6709\u6570\u636e\u5bfc\u81f4\u5185\u5b58\u6ea2\u51fa\u7684\u95ee\u9898\uff0c\u4e5f\u53ef\u4ee5\u4f7f\u5904\u7406\u8fc7\u7a0b\u66f4\u52a0\u9ad8\u6548\u3002\u4ee5\u4e0b\u662f\u4f7f\u7528Python\u8fdb\u884c\u6279\u91cf\u5904\u7406\u7684
\u51e0\u79cd\u5e38\u89c1\u65b9\u6cd5\u3002<\/p>\n<\/p>\n<p><h4>1.1 \u4f7f\u7528\u751f\u6210\u5668<\/h4>\n<\/p>\n<p><p>\u751f\u6210\u5668\u662f\u4e00\u79cd\u7279\u6b8a\u7684\u8fed\u4ee3\u5668\uff0c\u53ef\u4ee5\u9010\u4e2a\u751f\u6210\u6570\u636e\uff0c\u800c\u4e0d\u662f\u4e00\u6b21\u6027\u5c06\u6240\u6709\u6570\u636e\u52a0\u8f7d\u5230\u5185\u5b58\u4e2d\u3002\u4f7f\u7528\u751f\u6210\u5668\u53ef\u4ee5\u6709\u6548\u5730\u8fdb\u884c\u6279\u91cf\u5904\u7406\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">def data_generator(data, batch_size):<\/p>\n<p>    for i in range(0, len(data), batch_size):<\/p>\n<p>        yield data[i:i + batch_size]<\/p>\n<h2><strong>\u793a\u4f8b\u4f7f\u7528<\/strong><\/h2>\n<p>data = range(1000000)  # \u5047\u8bbe\u6709\u4e00\u767e\u4e07\u6761\u6570\u636e<\/p>\n<p>batch_size = 10000<\/p>\n<p>for batch in data_generator(data, batch_size):<\/p>\n<p>    # \u5728\u8fd9\u91cc\u5904\u7406\u6bcf\u4e2a\u6279\u6b21\u7684\u6570\u636e<\/p>\n<p>    print(len(batch))  # \u6bcf\u6b21\u5904\u7406\u4e00\u4e07\u6761\u6570\u636e<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>1.2 \u4f7f\u7528Pandas\u7684<code>chunksize<\/code>\u53c2\u6570<\/h4>\n<\/p>\n<p><p>Pandas\u662f\u4e00\u4e2a\u5f3a\u5927\u7684\u6570\u636e\u5904\u7406\u5e93\uff0c\u5b83\u7684<code>read_csv<\/code>\u51fd\u6570\u53ef\u4ee5\u4f7f\u7528<code>chunksize<\/code>\u53c2\u6570\u5206\u5757\u8bfb\u53d6\u6570\u636e\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u5f88\u5927\u7684CSV\u6587\u4ef6<\/strong><\/h2>\n<p>file_path = &#39;large_dataset.csv&#39;<\/p>\n<p>chunk_size = 10000<\/p>\n<h2><strong>\u4f7f\u7528chunksize\u53c2\u6570\u5206\u5757\u8bfb\u53d6<\/strong><\/h2>\n<p>for chunk in pd.read_csv(file_path, chunksize=chunk_size):<\/p>\n<p>    # \u5728\u8fd9\u91cc\u5904\u7406\u6bcf\u4e2a\u5757\u7684\u6570\u636e<\/p>\n<p>    print(chunk.shape)  # 
\u6bcf\u6b21\u5904\u7406\u4e00\u4e07\u6761\u6570\u636e<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u3001\u5e76\u884c\u8ba1\u7b97<\/h3>\n<\/p>\n<p><p>\u5e76\u884c\u8ba1\u7b97\u662f\u6307\u540c\u65f6\u6267\u884c\u591a\u4e2a\u8ba1\u7b97\u4efb\u52a1\uff0c\u4ee5\u63d0\u9ad8\u8ba1\u7b97\u6548\u7387\u3002Python\u652f\u6301\u591a\u7ebf\u7a0b\u548c\u591a\u8fdb\u7a0b\u7f16\u7a0b\uff0c\u53ef\u4ee5\u6709\u6548\u5730\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\u3002<\/p>\n<\/p>\n<p><h4>2.1 \u591a\u7ebf\u7a0b<\/h4>\n<\/p>\n<p><p>\u591a\u7ebf\u7a0b\u9002\u7528\u4e8eI\/O\u5bc6\u96c6\u578b\u4efb\u52a1\uff0c\u4f8b\u5982\u6587\u4ef6\u8bfb\u53d6\u3001\u7f51\u7edc\u8bf7\u6c42\u7b49\u3002Python\u7684<code>threading<\/code>\u6a21\u5757\u53ef\u4ee5\u7528\u4e8e\u591a\u7ebf\u7a0b\u7f16\u7a0b\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import threading<\/p>\n<p>def process_data(data):<\/p>\n<p>    # \u5728\u8fd9\u91cc\u5904\u7406\u6570\u636e<\/p>\n<p>    pass<\/p>\n<p>data = range(1000000)<\/p>\n<p>batch_size = 10000<\/p>\n<p>threads = []<\/p>\n<p>for i in range(0, len(data), batch_size):<\/p>\n<p>    batch = data[i:i + batch_size]<\/p>\n<p>    thread = threading.Thread(target=process_data, args=(batch,))<\/p>\n<p>    threads.append(thread)<\/p>\n<p>    thread.start()<\/p>\n<h2><strong>\u7b49\u5f85\u6240\u6709\u7ebf\u7a0b\u5b8c\u6210<\/strong><\/h2>\n<p>for thread in threads:<\/p>\n<p>    thread.join()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2.2 \u591a\u8fdb\u7a0b<\/h4>\n<\/p>\n<p><p>\u591a\u8fdb\u7a0b\u9002\u7528\u4e8eCPU\u5bc6\u96c6\u578b\u4efb\u52a1\uff0c\u4f8b\u5982\u5927\u89c4\u6a21\u8ba1\u7b97\u3002Python\u7684<code>multiprocessing<\/code>\u6a21\u5757\u53ef\u4ee5\u7528\u4e8e\u591a\u8fdb\u7a0b\u7f16\u7a0b\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import multiprocessing<\/p>\n<p>def process_data(data):<\/p>\n<p>    # \u5728\u8fd9\u91cc\u5904\u7406\u6570\u636e<\/p>\n<p>    pass<\/p>\n<p>data = range(1000000)<\/p>\n<p>batch_size = 10000<\/p>\n<p>processes = 
[]<\/p>\n<p>for i in range(0, len(data), batch_size):<\/p>\n<p>    batch = data[i:i + batch_size]<\/p>\n<p>    process = multiprocessing.Process(target=process_data, args=(batch,))<\/p>\n<p>    processes.append(process)<\/p>\n<p>    process.start()<\/p>\n<h2><strong>\u7b49\u5f85\u6240\u6709\u8fdb\u7a0b\u5b8c\u6210<\/strong><\/h2>\n<p>for process in processes:<\/p>\n<p>    process.join()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e09\u3001\u4f7f\u7528\u9002\u5408\u5927\u6570\u636e\u5904\u7406\u7684\u5e93<\/h3>\n<\/p>\n<p><p>Python\u6709\u8bb8\u591a\u4e13\u95e8\u7528\u4e8e\u5927\u6570\u636e\u5904\u7406\u7684\u5e93\uff0c\u4f8b\u5982Pandas\u3001NumPy\u3001Dask\u7b49\u3002\u5b83\u4eec\u63d0\u4f9b\u4e86\u9ad8\u6548\u7684\u6570\u636e\u5904\u7406\u65b9\u6cd5\uff0c\u53ef\u4ee5\u4e00\u6b21\u6027\u5904\u7406\u5927\u91cf\u6570\u636e\u3002<\/p>\n<\/p>\n<p><h4>3.1 Pandas<\/h4>\n<\/p>\n<p><p>Pandas\u662f\u4e00\u4e2a\u5f3a\u5927\u7684\u6570\u636e\u5904\u7406\u5e93\uff0c\u9002\u7528\u4e8e\u7ed3\u6784\u5316\u6570\u636e\u7684\u5904\u7406\u3002\u5b83\u63d0\u4f9b\u4e86\u9ad8\u6548\u7684\u6570\u636e\u64cd\u4f5c\u65b9\u6cd5\uff0c\u53ef\u4ee5\u8f7b\u677e\u5904\u7406\u767e\u4e07\u7ea7\u522b\u7684\u6570\u636e\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u5f88\u5927\u7684CSV\u6587\u4ef6<\/strong><\/h2>\n<p>file_path = &#39;large_dataset.csv&#39;<\/p>\n<h2><strong>\u8bfb\u53d6\u6574\u4e2a\u6570\u636e\u96c6<\/strong><\/h2>\n<p>data = pd.read_csv(file_path)<\/p>\n<h2><strong>\u6570\u636e\u5904\u7406<\/strong><\/h2>\n<p>data[&#39;new_column&#39;] = data[&#39;existing_column&#39;].apply(lambda x: x * 2)<\/p>\n<p>print(data.head())<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3.2 
NumPy<\/h4>\n<\/p>\n<p><p>NumPy\u662f\u4e00\u4e2a\u7528\u4e8e\u79d1\u5b66\u8ba1\u7b97\u7684\u5e93\uff0c\u63d0\u4f9b\u4e86\u9ad8\u6548\u7684\u6570\u7ec4\u64cd\u4f5c\u65b9\u6cd5\u3002\u5b83\u9002\u7528\u4e8e\u6570\u503c\u6570\u636e\u7684\u5904\u7406\uff0c\u53ef\u4ee5\u8fdb\u884c\u5feb\u901f\u7684\u77e9\u9635\u8fd0\u7b97\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import numpy as np<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u5f88\u5927\u7684\u6570\u7ec4<\/strong><\/h2>\n<p>data = np.random.rand(1000000)<\/p>\n<h2><strong>\u6570\u636e\u5904\u7406<\/strong><\/h2>\n<p>result = data * 2<\/p>\n<p>print(result[:5])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3.3 Dask<\/h4>\n<\/p>\n<p><p>Dask\u662f\u4e00\u4e2a\u5e76\u884c\u8ba1\u7b97\u5e93\uff0c\u4e13\u95e8\u7528\u4e8e\u5904\u7406\u5927\u89c4\u6a21\u6570\u636e\u3002\u5b83\u53ef\u4ee5\u5c06\u5927\u6570\u636e\u96c6\u5206\u6210\u5c0f\u5757\uff0c\u8fdb\u884c\u5e76\u884c\u5904\u7406\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import dask.dataframe as dd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u5f88\u5927\u7684CSV\u6587\u4ef6<\/strong><\/h2>\n<p>file_path = &#39;large_dataset.csv&#39;<\/p>\n<h2><strong>\u4f7f\u7528Dask\u8bfb\u53d6\u6570\u636e<\/strong><\/h2>\n<p>data = dd.read_csv(file_path)<\/p>\n<h2><strong>\u6570\u636e\u5904\u7406<\/strong><\/h2>\n<p>data[&#39;new_column&#39;] = data[&#39;existing_column&#39;] * 2<\/p>\n<p>result = data.compute()<\/p>\n<p>print(result.head())<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u6570\u636e\u6d41\u5904\u7406<\/h3>\n<\/p>\n<p><p>\u6570\u636e\u6d41\u5904\u7406\u662f\u4e00\u79cd\u5b9e\u65f6\u5904\u7406\u5927\u91cf\u6570\u636e\u7684\u6280\u672f\uff0c\u9002\u7528\u4e8e\u9700\u8981\u5b9e\u65f6\u54cd\u5e94\u7684\u6570\u636e\u5904\u7406\u573a\u666f\u3002Python\u53ef\u4ee5\u901a\u8fc7\u76f8\u5e94\u7684\u5e93\uff08\u5982kafka-python\u3001PyFlink\uff09\u4f7f\u7528Apache Kafka\u3001Apache Flink\u7b49\u6d41\u5904\u7406\u5e73\u53f0\u3002<\/p>\n<\/p>\n<p><h4>4.1 \u4f7f\u7528Apache Kafka<\/h4>\n<\/p>\n<p><p>Apache 
Kafka\u662f\u4e00\u4e2a\u9ad8\u541e\u5410\u91cf\u3001\u4f4e\u5ef6\u8fdf\u7684\u5206\u5e03\u5f0f\u6d41\u5904\u7406\u5e73\u53f0\uff0c\u9002\u7528\u4e8e\u5b9e\u65f6\u6570\u636e\u6d41\u7684\u5904\u7406\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from kafka import KafkaConsumer<\/p>\n<h2><strong>\u521b\u5efa\u6d88\u8d39\u8005<\/strong><\/h2>\n<p>consumer = KafkaConsumer(&#39;my_topic&#39;, bootstrap_servers=[&#39;localhost:9092&#39;])<\/p>\n<h2><strong>\u5b9e\u65f6\u5904\u7406\u6570\u636e\u6d41<\/strong><\/h2>\n<p>for message in consumer:<\/p>\n<p>    data = message.value<\/p>\n<p>    # \u5728\u8fd9\u91cc\u5904\u7406\u6570\u636e<\/p>\n<p>    print(data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4.2 \u4f7f\u7528Apache Flink<\/h4>\n<\/p>\n<p><p>Apache Flink\u662f\u4e00\u4e2a\u5206\u5e03\u5f0f\u6d41\u5904\u7406\u6846\u67b6\uff0c\u9002\u7528\u4e8e\u5927\u89c4\u6a21\u6570\u636e\u6d41\u7684\u5904\u7406\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from pyflink.datastream import StreamExecutionEnvironment<\/p>\n<h2><strong>\u521b\u5efa\u6267\u884c\u73af\u5883<\/strong><\/h2>\n<p>env = StreamExecutionEnvironment.get_execution_environment()<\/p>\n<h2><strong>\u521b\u5efa\u6570\u636e\u6d41<\/strong><\/h2>\n<p>data_stream = env.from_elements(1, 2, 3, 4, 5)<\/p>\n<h2><strong>\u6570\u636e\u5904\u7406<\/strong><\/h2>\n<p>data_stream.map(lambda x: x * 2).print()<\/p>\n<h2><strong>\u6267\u884c\u7a0b\u5e8f<\/strong><\/h2>\n<p>env.execute(&quot;data stream job&quot;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e94\u3001\u4f7f\u7528\u6570\u636e\u5e93<\/h3>\n<\/p>\n<p><p>\u5f53\u6570\u636e\u91cf\u975e\u5e38\u5927\u65f6\uff0c\u53ef\u4ee5\u8003\u8651\u5c06\u6570\u636e\u5b58\u50a8\u5728\u6570\u636e\u5e93\u4e2d\uff0c\u5e76\u901a\u8fc7\u6570\u636e\u5e93\u67e5\u8be2\u8fdb\u884c\u6570\u636e\u5904\u7406\u3002Python\u652f\u6301\u591a\u79cd\u6570\u636e\u5e93\u8fde\u63a5\uff0c\u4f8b\u5982MySQL\u3001PostgreSQL\u3001MongoDB\u7b49\u3002<\/p>\n<\/p>\n<p><h4>5.1 
\u4f7f\u7528MySQL<\/h4>\n<\/p>\n<p><p>MySQL\u662f\u4e00\u4e2a\u5e38\u7528\u7684\u5173\u7cfb\u578b\u6570\u636e\u5e93\uff0c\u9002\u7528\u4e8e\u7ed3\u6784\u5316\u6570\u636e\u7684\u5b58\u50a8\u548c\u67e5\u8be2\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import mysql.connector<\/p>\n<h2><strong>\u8fde\u63a5\u6570\u636e\u5e93<\/strong><\/h2>\n<p>conn = mysql.connector.connect(<\/p>\n<p>    host=&#39;localhost&#39;,<\/p>\n<p>    user=&#39;user&#39;,<\/p>\n<p>    password=&#39;password&#39;,<\/p>\n<p>    database=&#39;database&#39;<\/p>\n<p>)<\/p>\n<h2><strong>\u521b\u5efa\u6e38\u6807<\/strong><\/h2>\n<p>cursor = conn.cursor()<\/p>\n<h2><strong>\u6267\u884c\u67e5\u8be2<\/strong><\/h2>\n<p>cursor.execute(&quot;SELECT * FROM my_table&quot;)<\/p>\n<h2><strong>\u5904\u7406\u67e5\u8be2\u7ed3\u679c<\/strong><\/h2>\n<p>for row in cursor.fetchall():<\/p>\n<p>    print(row)<\/p>\n<h2><strong>\u5173\u95ed\u8fde\u63a5<\/strong><\/h2>\n<p>conn.close()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>5.2 \u4f7f\u7528MongoDB<\/h4>\n<\/p>\n<p><p>MongoDB\u662f\u4e00\u4e2a\u5e38\u7528\u7684NoSQL\u6570\u636e\u5e93\uff0c\u9002\u7528\u4e8e\u534a\u7ed3\u6784\u5316\u548c\u975e\u7ed3\u6784\u5316\u6570\u636e\u7684\u5b58\u50a8\u548c\u67e5\u8be2\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from pymongo import MongoClient<\/p>\n<h2><strong>\u8fde\u63a5\u6570\u636e\u5e93<\/strong><\/h2>\n<p>client = MongoClient(&#39;localhost&#39;, 27017)<\/p>\n<p>db = client[&#39;database&#39;]<\/p>\n<h2><strong>\u6267\u884c\u67e5\u8be2<\/strong><\/h2>\n<p>collection = db[&#39;my_collection&#39;]<\/p>\n<p>for document in collection.find():<\/p>\n<p>    
print(document)<\/p>\n<h2><strong>\u5173\u95ed\u8fde\u63a5<\/strong><\/h2>\n<p>client.close()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516d\u3001\u6570\u636e\u9884\u5904\u7406\u4e0e\u6e05\u6d17<\/h3>\n<\/p>\n<p><p>\u5728\u5904\u7406\u5927\u91cf\u6570\u636e\u4e4b\u524d\uff0c\u901a\u5e38\u9700\u8981\u8fdb\u884c\u6570\u636e\u9884\u5904\u7406\u4e0e\u6e05\u6d17\u3002\u8fd9\u5305\u62ec\u6570\u636e\u53bb\u91cd\u3001\u7f3a\u5931\u503c\u5904\u7406\u3001\u6570\u636e\u7c7b\u578b\u8f6c\u6362\u7b49\u3002Python\u63d0\u4f9b\u4e86\u591a\u79cd\u5de5\u5177\u548c\u5e93\u6765\u8fdb\u884c\u6570\u636e\u9884\u5904\u7406\u3002<\/p>\n<\/p>\n<p><h4>6.1 \u6570\u636e\u53bb\u91cd<\/h4>\n<\/p>\n<p><p>\u6570\u636e\u53bb\u91cd\u662f\u6307\u53bb\u9664\u6570\u636e\u96c6\u4e2d\u7684\u91cd\u590d\u8bb0\u5f55\u3002Pandas\u63d0\u4f9b\u4e86\u65b9\u4fbf\u7684\u65b9\u6cd5\u6765\u8fdb\u884c\u6570\u636e\u53bb\u91cd\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u6570\u636e\u96c6<\/strong><\/h2>\n<p>data = pd.DataFrame({<\/p>\n<p>    &#39;id&#39;: [1, 2, 2, 3, 4, 4, 5],<\/p>\n<p>    &#39;value&#39;: [10, 20, 20, 30, 40, 40, 50]<\/p>\n<p>})<\/p>\n<h2><strong>\u6570\u636e\u53bb\u91cd<\/strong><\/h2>\n<p>data = data.drop_duplicates()<\/p>\n<p>print(data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>6.2 \u7f3a\u5931\u503c\u5904\u7406<\/h4>\n<\/p>\n<p><p>\u7f3a\u5931\u503c\u5904\u7406\u662f\u6307\u586b\u8865\u6216\u53bb\u9664\u6570\u636e\u96c6\u4e2d\u7684\u7f3a\u5931\u503c\u3002Pandas\u63d0\u4f9b\u4e86\u591a\u79cd\u65b9\u6cd5\u6765\u5904\u7406\u7f3a\u5931\u503c\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u6570\u636e\u96c6<\/strong><\/h2>\n<p>data = pd.DataFrame({<\/p>\n<p>    &#39;id&#39;: [1, 2, 3, 4, 5],<\/p>\n<p>    &#39;value&#39;: [10, None, 30, None, 
50]<\/p>\n<p>})<\/p>\n<h2><strong>\u586b\u8865\u7f3a\u5931\u503c<\/strong><\/h2>\n<p>data[&#39;value&#39;] = data[&#39;value&#39;].fillna(data[&#39;value&#39;].mean())<\/p>\n<p>print(data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>6.3 \u6570\u636e\u7c7b\u578b\u8f6c\u6362<\/h4>\n<\/p>\n<p><p>\u6570\u636e\u7c7b\u578b\u8f6c\u6362\u662f\u6307\u5c06\u6570\u636e\u96c6\u4e2d\u7684\u6570\u636e\u7c7b\u578b\u8fdb\u884c\u8f6c\u6362\uff0c\u4f8b\u5982\u5c06\u5b57\u7b26\u4e32\u8f6c\u6362\u4e3a\u6570\u503c\u3002Pandas\u63d0\u4f9b\u4e86\u65b9\u4fbf\u7684\u65b9\u6cd5\u6765\u8fdb\u884c\u6570\u636e\u7c7b\u578b\u8f6c\u6362\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u5047\u8bbe\u6709\u4e00\u4e2a\u6570\u636e\u96c6<\/strong><\/h2>\n<p>data = pd.DataFrame({<\/p>\n<p>    &#39;id&#39;: [1, 2, 3, 4, 5],<\/p>\n<p>    &#39;value&#39;: [&#39;10&#39;, &#39;20&#39;, &#39;30&#39;, &#39;40&#39;, &#39;50&#39;]<\/p>\n<p>})<\/p>\n<h2><strong>\u6570\u636e\u7c7b\u578b\u8f6c\u6362<\/strong><\/h2>\n<p>data[&#39;value&#39;] = data[&#39;value&#39;].astype(int)<\/p>\n<p>print(data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e03\u3001\u4f18\u5316\u6570\u636e\u5904\u7406\u6027\u80fd<\/h3>\n<\/p>\n<p><p>\u5728\u5904\u7406\u5927\u91cf\u6570\u636e\u65f6\uff0c\u4f18\u5316\u6570\u636e\u5904\u7406\u6027\u80fd\u662f\u975e\u5e38\u91cd\u8981\u7684\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u5e38\u89c1\u7684\u4f18\u5316\u6280\u5de7\u3002<\/p>\n<\/p>\n<p><h4>7.1 \u4f7f\u7528\u5408\u9002\u7684\u6570\u636e\u7ed3\u6784<\/h4>\n<\/p>\n<p><p>\u9009\u62e9\u5408\u9002\u7684\u6570\u636e\u7ed3\u6784\u53ef\u4ee5\u663e\u8457\u63d0\u9ad8\u6570\u636e\u5904\u7406\u7684\u6027\u80fd\u3002\u4f8b\u5982\uff0c\u4f7f\u7528NumPy\u6570\u7ec4\u4ee3\u66ffPython\u5217\u8868\u53ef\u4ee5\u63d0\u9ad8\u6570\u503c\u8ba1\u7b97\u7684\u6548\u7387\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import numpy as np<\/p>\n<h2><strong>\u521b\u5efaNumPy\u6570\u7ec4<\/strong><\/h2>\n<p>data = 
np.random.rand(1000000)<\/p>\n<h2><strong>\u6570\u636e\u5904\u7406<\/strong><\/h2>\n<p>result = data * 2<\/p>\n<p>print(result[:5])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>7.2 \u4f7f\u7528\u77e2\u91cf\u5316\u64cd\u4f5c<\/h4>\n<\/p>\n<p><p>\u77e2\u91cf\u5316\u64cd\u4f5c\u662f\u6307\u5728\u6570\u7ec4\u6216\u77e9\u9635\u4e0a\u8fdb\u884c\u9010\u5143\u7d20\u7684\u64cd\u4f5c\uff0c\u907f\u514d\u4f7f\u7528\u5faa\u73af\u3002NumPy\u548cPandas\u90fd\u652f\u6301\u77e2\u91cf\u5316\u64cd\u4f5c\uff0c\u53ef\u4ee5\u663e\u8457\u63d0\u9ad8\u6570\u636e\u5904\u7406\u7684\u6027\u80fd\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import numpy as np<\/p>\n<h2><strong>\u521b\u5efaNumPy\u6570\u7ec4<\/strong><\/h2>\n<p>data = np.random.rand(1000000)<\/p>\n<h2><strong>\u77e2\u91cf\u5316\u64cd\u4f5c<\/strong><\/h2>\n<p>result = np.sqrt(data)<\/p>\n<p>print(result[:5])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>7.3 \u4f7f\u7528\u5e76\u884c\u8ba1\u7b97<\/h4>\n<\/p>\n<p><p>\u5e76\u884c\u8ba1\u7b97\u53ef\u4ee5\u6709\u6548\u5730\u63d0\u9ad8\u6570\u636e\u5904\u7406\u7684\u6027\u80fd\u3002Python\u652f\u6301\u591a\u7ebf\u7a0b\u548c\u591a\u8fdb\u7a0b\u7f16\u7a0b\uff1a\u591a\u7ebf\u7a0b\u9002\u7528\u4e8eI\/O\u5bc6\u96c6\u578b\u4efb\u52a1\uff0c\u591a\u8fdb\u7a0b\u53ef\u4ee5\u5229\u7528\u591a\u6838CPU\u5904\u7406CPU\u5bc6\u96c6\u578b\u4efb\u52a1\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import concurrent.futures<\/p>\n<p>def process_data(batch):<\/p>\n<p>    # \u6570\u636e\u5904\u7406<\/p>\n<p>    return [x * 2 for x in batch]<\/p>\n<p>data = range(1000000)<\/p>\n<p>batch_size = 10000<\/p>\n<h2><strong>\u4f7f\u7528\u591a\u7ebf\u7a0b\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97<\/strong><\/h2>\n<p>with concurrent.futures.ThreadPoolExecutor() as executor:<\/p>\n<p>    results = list(executor.map(process_data, [data[i:i + batch_size] for i in range(0, len(data), batch_size)]))<\/p>\n<h2><strong>\u5408\u5e76\u7ed3\u679c<\/strong><\/h2>\n<p>result = [item for sublist in results for item in sublist]<\/p>\n<p>print(result[:5])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>7.4 
\u4f7f\u7528\u7f13\u5b58<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528\u7f13\u5b58\u53ef\u4ee5\u907f\u514d\u91cd\u590d\u8ba1\u7b97\uff0c\u63d0\u9ad8\u6570\u636e\u5904\u7406\u7684\u6548\u7387\u3002Python\u7684<code>functools<\/code>\u6a21\u5757\u63d0\u4f9b\u4e86\u4e00\u4e2a\u7b80\u5355\u7684\u7f13\u5b58\u88c5\u9970\u5668<code>lru_cache<\/code>\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from functools import lru_cache<\/p>\n<p>@lru_cache(maxsize=None)<\/p>\n<p>def compute(x):<\/p>\n<p>    # \u6a21\u62df\u8017\u65f6\u8ba1\u7b97<\/p>\n<p>    return x * 2<\/p>\n<p>data = range(1000)<\/p>\n<p>result = [compute(x) for x in data]<\/p>\n<p>print(result[:5])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516b\u3001\u603b\u7ed3<\/h3>\n<\/p>\n<p><p>\u672c\u6587\u8be6\u7ec6\u4ecb\u7ecd\u4e86\u5728Python\u4e2d\u4e00\u6b21\u8fd0\u884c\u5927\u91cf\u6570\u636e\u7684\u591a\u79cd\u65b9\u6cd5\uff0c\u5305\u62ec<strong>\u6279\u91cf\u5904\u7406\u3001\u5e76\u884c\u8ba1\u7b97\u3001\u591a\u7ebf\u7a0b\u548c\u591a\u8fdb\u7a0b\u3001\u4f7f\u7528\u9002\u5408\u5927\u6570\u636e\u5904\u7406\u7684\u5e93\u5982Pandas\u548cNumPy<\/strong>\u7b49\u3002\u6bcf\u79cd\u65b9\u6cd5\u90fd\u6709\u5176\u9002\u7528\u7684\u573a\u666f\u548c\u4f18\u7f3a\u70b9\uff0c\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u53ef\u4ee5\u663e\u8457\u63d0\u9ad8\u6570\u636e\u5904\u7406\u7684\u6548\u7387\u3002\u5728\u5b9e\u9645\u5e94\u7528\u4e2d\uff0c\u53ef\u4ee5\u6839\u636e\u5177\u4f53\u9700\u6c42\u548c\u6570\u636e\u7279\u70b9\uff0c\u7075\u6d3b\u8fd0\u7528\u8fd9\u4e9b\u65b9\u6cd5\uff0c\u4ee5\u8fbe\u5230\u6700\u4f73\u7684\u6570\u636e\u5904\u7406\u6548\u679c\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u4f7f\u7528Python\u5904\u7406\u5927\u91cf\u6570\u636e\uff1f<\/strong><br 
\/>\u5728Python\u4e2d\u5904\u7406\u5927\u91cf\u6570\u636e\u53ef\u4ee5\u901a\u8fc7\u591a\u79cd\u65b9\u5f0f\u5b9e\u73b0\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u4f7f\u7528Pandas\u5e93\uff0c\u5b83\u53ef\u4ee5\u65b9\u4fbf\u5730\u5904\u7406\u5927\u89c4\u6a21\u6570\u636e\u96c6\u3002\u6b64\u5916\uff0cNumPy\u5e93\u4e5f\u80fd\u9ad8\u6548\u5730\u8fdb\u884c\u6570\u503c\u8ba1\u7b97\u3002\u5bf9\u4e8e\u66f4\u590d\u6742\u7684\u6570\u636e\u96c6\uff0c\u53ef\u4ee5\u8003\u8651\u4f7f\u7528Dask\u6216PySpark\uff0c\u8fd9\u4e9b\u5de5\u5177\u53ef\u4ee5\u5904\u7406\u5206\u5e03\u5f0f\u6570\u636e\uff0c\u63d0\u4f9b\u66f4\u9ad8\u7684\u6027\u80fd\u548c\u7075\u6d3b\u6027\u3002<\/p>\n<p><strong>\u5728Python\u4e2d\u5982\u4f55\u4f18\u5316\u6570\u636e\u5904\u7406\u6027\u80fd\uff1f<\/strong><br \/>\u4e3a\u4e86\u63d0\u9ad8Python\u5904\u7406\u5927\u91cf\u6570\u636e\u7684\u6027\u80fd\uff0c\u53ef\u4ee5\u8003\u8651\u4f7f\u7528\u77e2\u91cf\u5316\u64cd\u4f5c\uff0c\u907f\u514d\u4f7f\u7528\u5faa\u73af\u3002Pandas\u548cNumPy\u90fd\u652f\u6301\u8fd9\u6837\u7684\u64cd\u4f5c\uff0c\u80fd\u663e\u8457\u63d0\u5347\u901f\u5ea6\u3002\u6b64\u5916\uff0c\u5408\u7406\u4f7f\u7528\u5185\u5b58\u7ba1\u7406\u5de5\u5177\uff0c\u5982<code>gc<\/code>\u6a21\u5757\uff0c\u80fd\u591f\u5e2e\u52a9\u91ca\u653e\u4e0d\u518d\u9700\u8981\u7684\u5185\u5b58\u7a7a\u95f4\uff0c\u8fdb\u4e00\u6b65\u4f18\u5316\u6027\u80fd\u3002\u4f7f\u7528\u5408\u9002\u7684\u6570\u636e\u7c7b\u578b\uff08\u4f8b\u5982\u5c06\u6574\u6570\u7c7b\u578b\u8f6c\u6362\u4e3a\u66f4\u5c0f\u7684\u7c7b\u578b\uff09\u4e5f\u6709\u52a9\u4e8e\u51cf\u5c11\u5185\u5b58\u5360\u7528\u3002<\/p>\n<p><strong>\u4f7f\u7528Python\u65f6\u662f\u5426\u9700\u8981\u8003\u8651\u6570\u636e\u7684\u5b58\u50a8\u683c\u5f0f\uff1f<\/strong><br 
\/>\u662f\u7684\uff0c\u5b58\u50a8\u683c\u5f0f\u5bf9\u6570\u636e\u5904\u7406\u7684\u6548\u7387\u6709\u5f88\u5927\u5f71\u54cd\u3002\u5e38\u7528\u7684\u5b58\u50a8\u683c\u5f0f\u5305\u62ecCSV\u3001Parquet\u548cHDF5\u7b49\u3002CSV\u683c\u5f0f\u6613\u4e8e\u4f7f\u7528\uff0c\u4f46\u5bf9\u4e8e\u5927\u89c4\u6a21\u6570\u636e\u5904\u7406\u6548\u7387\u8f83\u4f4e\u3002Parquet\u548cHDF5\u683c\u5f0f\u652f\u6301\u538b\u7f29\u548c\u5feb\u901f\u8bfb\u53d6\uff0c\u9002\u5408\u5904\u7406\u5927\u6570\u636e\u96c6\u3002\u6839\u636e\u5177\u4f53\u9700\u6c42\u9009\u62e9\u5408\u9002\u7684\u5b58\u50a8\u683c\u5f0f\uff0c\u53ef\u4ee5\u663e\u8457\u63d0\u9ad8\u6570\u636e\u8bfb\u53d6\u548c\u5904\u7406\u7684\u901f\u5ea6\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"Python\u5982\u4f55\u4e00\u6b21\u8fd0\u884c\u597d\u591a\u6570\u636e Python\u4e00\u6b21\u8fd0\u884c\u5927\u91cf\u6570\u636e\u7684\u65b9\u6cd5\u4e3b\u8981\u5305\u62ec\uff1a\u6279\u91cf\u5904\u7406\u3001\u5e76\u884c\u8ba1\u7b97\u3001\u591a\u7ebf\u7a0b\u548c\u591a\u8fdb 
[&hellip;]","protected":false},"author":3,"featured_media":1131916,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1131910"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1131910"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1131910\/revisions"}],"predecessor-version":[{"id":1131917,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1131910\/revisions\/1131917"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1131916"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1131910"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1131910"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1131910"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}