{"id":1160771,"date":"2025-01-13T19:08:57","date_gmt":"2025-01-13T11:08:57","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1160771.html"},"modified":"2025-01-13T19:09:01","modified_gmt":"2025-01-13T11:09:01","slug":"python%e7%88%ac%e8%99%ab%e5%a6%82%e4%bd%95%e5%ae%9a%e4%bd%8d%e7%9b%ae%e6%a0%87%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1160771.html","title":{"rendered":"python\u722c\u866b\u5982\u4f55\u5b9a\u4f4d\u76ee\u6807\u6570\u636e"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/25202155\/f945bbdc-4922-4491-a078-650e578e2563.webp\" alt=\"python\u722c\u866b\u5982\u4f55\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\" \/><\/p>\n<p><p> \u5728Python\u722c\u866b\u4e2d\uff0c\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\u7684\u6838\u5fc3\u65b9\u6cd5\u662f\u4f7f\u7528 <strong>\u9009\u62e9\u5408\u9002\u7684\u89e3\u6790\u5e93\u3001\u5206\u6790\u7f51\u9875\u7ed3\u6784\u3001\u9009\u62e9\u9002\u5f53\u7684\u5b9a\u4f4d\u65b9\u6cd5\u3001\u7ed3\u5408\u6b63\u5219\u8868\u8fbe\u5f0f<\/strong>\u3002\u5176\u4e2d\uff0c\u9009\u62e9\u5408\u9002\u7684\u89e3\u6790\u5e93\u662f\u6700\u4e3a\u91cd\u8981\u7684\u4e00\u70b9\u3002Python\u4e2d\u5e38\u7528\u7684\u89e3\u6790\u5e93\u5305\u62ecBeautifulSoup\u3001lxml\u548cScrapy\u3002BeautifulSoup\u662f\u4e00\u4e2a\u7b80\u5355\u6613\u7528\u7684HTML\u548cXML\u89e3\u6790\u5e93\uff0c\u9002\u5408\u5904\u7406\u7ed3\u6784\u590d\u6742\u4e14\u9700\u8981\u7075\u6d3b\u89e3\u6790\u7684\u7f51\u9875\u3002\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u8be6\u7ec6\u4ecb\u7ecd\u5982\u4f55\u4f7f\u7528BeautifulSoup\u6765\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\u3002<\/p>\n<\/p>\n<p><p><strong>BeautifulSoup\u89e3\u6790\u5e93\u7684\u4f7f\u7528<\/strong><\/p>\n<\/p>\n<p><p>BeautifulSoup\u662fPython\u4e2d\u975e\u5e38\u6d41\u884c\u7684HTML\u89e3\u6790\u5e93\uff0c\u80fd\u591f\u5feb\u901f\u3001\u7075\u6d3b\u5730\u89e3\u6790HTML\u548cXML\u6
587\u6863\u3002\u4f7f\u7528BeautifulSoup\u89e3\u6790\u7f51\u9875\u6570\u636e\u7684\u6b65\u9aa4\u5982\u4e0b\uff1a<\/p>\n<\/p>\n<ol>\n<li><strong>\u5b89\u88c5BeautifulSoup<\/strong><\/p>\n<p>\u9996\u5148\uff0c\u9700\u8981\u5b89\u88c5BeautifulSoup\u5e93\u3002\u5728\u547d\u4ee4\u884c\u4e2d\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\uff1a<\/li>\n<\/p>\n<\/ol>\n<p><pre><code class=\"language-bash\">pip install beautifulsoup4<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u540c\u65f6\uff0c\u901a\u5e38\u8fd8\u9700\u8981\u5b89\u88c5<code>lxml<\/code>\u5e93\u6765\u52a0\u901f\u89e3\u6790\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install lxml<\/p>\n<p><\/code><\/pre>\n<\/p>\n<ol start=\"2\">\n<li><strong>\u52a0\u8f7d\u7f51\u9875\u5185\u5bb9<\/strong><\/p>\n<p>\u4f7f\u7528requests\u5e93\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\uff1a<\/li>\n<\/p>\n<\/ol>\n<p><pre><code class=\"language-python\">import requests<\/p>\n<p>from bs4 import BeautifulSoup<\/p>\n<p>url = &quot;http:\/\/example.com&quot;<\/p>\n<p>response = requests.get(url)<\/p>\n<p>html_content = response.content<\/p>\n<p><\/code><\/pre>\n<\/p>\n<ol start=\"3\">\n<li><strong>\u89e3\u6790\u7f51\u9875\u5185\u5bb9<\/strong><\/p>\n<p>\u4f7f\u7528BeautifulSoup\u89e3\u6790\u7f51\u9875\u5185\u5bb9\uff1a<\/li>\n<\/p>\n<\/ol>\n<p><pre><code class=\"language-python\">soup = BeautifulSoup(html_content, &#39;lxml&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<ol start=\"4\">\n<li><strong>\u67e5\u627e\u76ee\u6807\u6570\u636e<\/strong><\/p>\n<p>\u6839\u636eHTML\u7ed3\u6784\uff0c\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u67e5\u627e\u76ee\u6807\u6570\u636e\uff1a<\/li>\n<\/p>\n<\/ol>\n<p><pre><code class=\"language-python\"># \u67e5\u627e\u6240\u6709\u7684&lt;p&gt;\u6807\u7b7e<\/p>\n<p>paragraphs = soup.find_all(&#39;p&#39;)<\/p>\n<h2><strong>\u67e5\u627e\u5177\u6709\u7279\u5b9a\u7c7b\u540d\u7684\u6807\u7b7e<\/strong><\/h2>\n<p>specific_class = soup.find_all(&#39;div&#39;, 
class_=&#39;classname&#39;)<\/p>\n<h2><strong>\u67e5\u627e\u5177\u6709\u7279\u5b9aid\u7684\u6807\u7b7e<\/strong><\/h2>\n<p>specific_id = soup.find(&#39;div&#39;, id=&#39;specificid&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e00\u3001\u9009\u62e9\u5408\u9002\u7684\u89e3\u6790\u5e93<\/h3>\n<\/p>\n<p><p>Python\u4e2d\u6709\u591a\u79cd\u89e3\u6790\u5e93\u53ef\u4f9b\u9009\u62e9\uff0c\u5e38\u7528\u7684\u5305\u62ecBeautifulSoup\u3001lxml\u548cScrapy\u3002\u6bcf\u4e2a\u89e3\u6790\u5e93\u90fd\u6709\u5176\u72ec\u7279\u7684\u4f18\u52bf\u548c\u9002\u7528\u573a\u666f\u3002<\/p>\n<\/p>\n<p><h4>1. BeautifulSoup<\/h4>\n<\/p>\n<p><p><strong>BeautifulSoup<\/strong>\u662f\u4e00\u4e2a\u7b80\u5355\u6613\u7528\u7684HTML\u548cXML\u89e3\u6790\u5e93\uff0c\u9002\u5408\u5904\u7406\u7ed3\u6784\u590d\u6742\u4e14\u9700\u8981\u7075\u6d3b\u89e3\u6790\u7684\u7f51\u9875\u3002\u5b83\u5177\u6709\u4ee5\u4e0b\u7279\u70b9\uff1a<\/p>\n<\/p>\n<ul>\n<li>\u6613\u4e8e\u5b66\u4e60\u548c\u4f7f\u7528\uff0c\u9002\u5408\u521d\u5b66\u8005\u3002<\/li>\n<li>\u652f\u6301\u591a\u79cd\u89e3\u6790\u5668\uff08\u5982html.parser\u3001lxml\u3001html5lib\u7b49\uff09\uff0c\u53ef\u4ee5\u6839\u636e\u9700\u6c42\u9009\u62e9\u3002<\/li>\n<li>\u63d0\u4f9b\u4e30\u5bcc\u7684\u67e5\u627e\u548c\u7b5b\u9009\u529f\u80fd\uff0c\u80fd\u591f\u65b9\u4fbf\u5730\u63d0\u53d6\u6240\u9700\u6570\u636e\u3002<\/li>\n<\/ul>\n<p><p>\u793a\u4f8b\u4ee3\u7801\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from bs4 import BeautifulSoup<\/p>\n<p>html_content = &quot;&lt;html&gt;&lt;body&gt;&lt;p&gt;Hello, World!&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;&quot;<\/p>\n<p>soup = BeautifulSoup(html_content, &#39;lxml&#39;)<\/p>\n<p>print(soup.p.text)  # \u8f93\u51fa\uff1aHello, World!<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. 
lxml<\/h4>\n<\/p>\n<p><p><strong>lxml<\/strong>\u662f\u4e00\u4e2a\u9ad8\u6027\u80fd\u7684HTML\u548cXML\u89e3\u6790\u5e93\uff0c\u9002\u5408\u5904\u7406\u5927\u89c4\u6a21\u6570\u636e\u548c\u9700\u8981\u9ad8\u6548\u89e3\u6790\u7684\u573a\u666f\u3002\u5b83\u5177\u6709\u4ee5\u4e0b\u7279\u70b9\uff1a<\/p>\n<\/p>\n<ul>\n<li>\u89e3\u6790\u901f\u5ea6\u5feb\uff0c\u6027\u80fd\u4f18\u8d8a\u3002<\/li>\n<li>\u63d0\u4f9b\u5b8c\u6574\u7684XPath\u652f\u6301\uff0c\u80fd\u591f\u901a\u8fc7XPath\u8868\u8fbe\u5f0f\u5feb\u901f\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\u3002<\/li>\n<li>\u652f\u6301HTML\u548cXML\u4e24\u79cd\u683c\u5f0f\u3002<\/li>\n<\/ul>\n<p><p>\u793a\u4f8b\u4ee3\u7801\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from lxml import etree<\/p>\n<p>html_content = &quot;&lt;html&gt;&lt;body&gt;&lt;p&gt;Hello, World!&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;&quot;<\/p>\n<p>tree = etree.HTML(html_content)<\/p>\n<p>result = tree.xpath(&#39;\/\/p\/text()&#39;)<\/p>\n<p>print(result)  # \u8f93\u51fa\uff1a[&#39;Hello, World!&#39;]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
Scrapy<\/h4>\n<\/p>\n<p><p><strong>Scrapy<\/strong>\u662f\u4e00\u4e2a\u529f\u80fd\u5f3a\u5927\u7684\u7f51\u7edc\u722c\u866b\u6846\u67b6\uff0c\u9002\u5408\u5f00\u53d1\u590d\u6742\u7684\u722c\u866b\u9879\u76ee\u548c\u9700\u8981\u9ad8\u6548\u6293\u53d6\u5927\u91cf\u6570\u636e\u7684\u573a\u666f\u3002\u5b83\u5177\u6709\u4ee5\u4e0b\u7279\u70b9\uff1a<\/p>\n<\/p>\n<ul>\n<li>\u63d0\u4f9b\u4e86\u5b8c\u6574\u7684\u722c\u866b\u6846\u67b6\uff0c\u652f\u6301\u5f02\u6b65\u8bf7\u6c42\u548c\u5e76\u53d1\u5904\u7406\u3002<\/li>\n<li>\u5185\u7f6e\u4e30\u5bcc\u7684\u4e2d\u95f4\u4ef6\u548c\u7ba1\u9053\uff0c\u65b9\u4fbf\u8fdb\u884c\u6570\u636e\u5904\u7406\u548c\u5b58\u50a8\u3002<\/li>\n<li>\u652f\u6301\u591a\u79cd\u89e3\u6790\u5e93\uff08\u5982BeautifulSoup\u3001lxml\u7b49\uff09\uff0c\u53ef\u4ee5\u6839\u636e\u9700\u6c42\u9009\u62e9\u3002<\/li>\n<\/ul>\n<p><p>\u793a\u4f8b\u4ee3\u7801\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import scrapy<\/p>\n<p>class MySpider(scrapy.Spider):<\/p>\n<p>    name = &#39;myspider&#39;<\/p>\n<p>    start_urls = [&#39;http:\/\/example.com&#39;]<\/p>\n<p>    def parse(self, response):<\/p>\n<p>        for p in response.xpath(&#39;\/\/p\/text()&#39;):<\/p>\n<p>            yield {&#39;text&#39;: p.get()}<\/p>\n<p># \u8fd0\u884c\u722c\u866b<\/p>\n<p># scrapy runspider myspider.py<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u3001\u5206\u6790\u7f51\u9875\u7ed3\u6784<\/h3>\n<\/p>\n<p><p>\u5728\u8fdb\u884c\u6570\u636e\u89e3\u6790\u4e4b\u524d\uff0c\u9996\u5148\u9700\u8981\u5206\u6790\u7f51\u9875\u7684\u7ed3\u6784\u3002\u901a\u8fc7\u67e5\u770b\u7f51\u9875\u7684HTML\u6e90\u4ee3\u7801\uff0c\u53ef\u4ee5\u4e86\u89e3\u7f51\u9875\u7684\u5c42\u6b21\u7ed3\u6784\u548c\u6807\u7b7e\u5206\u5e03\uff0c\u4ece\u800c\u786e\u5b9a\u76ee\u6807\u6570\u636e\u6240\u5728\u7684\u4f4d\u7f6e\u3002<\/p>\n<\/p>\n<p><h4>1. 
\u4f7f\u7528\u6d4f\u89c8\u5668\u5f00\u53d1\u8005\u5de5\u5177<\/h4>\n<\/p>\n<p><p>\u73b0\u4ee3\u6d4f\u89c8\u5668\uff08\u5982Chrome\u3001Firefox\u7b49\uff09\u90fd\u63d0\u4f9b\u4e86\u5f00\u53d1\u8005\u5de5\u5177\uff0c\u53ef\u4ee5\u65b9\u4fbf\u5730\u67e5\u770b\u7f51\u9875\u7684HTML\u6e90\u4ee3\u7801\u548cCSS\u6837\u5f0f\u3002\u901a\u8fc7\u53f3\u952e\u70b9\u51fb\u7f51\u9875\u4e0a\u7684\u76ee\u6807\u6570\u636e\uff0c\u7136\u540e\u9009\u62e9\u201c\u68c0\u67e5\u201d\u6216\u201c\u67e5\u770b\u5143\u7d20\u201d\uff0c\u53ef\u4ee5\u6253\u5f00\u5f00\u53d1\u8005\u5de5\u5177\uff0c\u5e76\u5b9a\u4f4d\u5230\u5bf9\u5e94\u7684HTML\u6807\u7b7e\u3002<\/p>\n<\/p>\n<p><h4>2. \u7406\u89e3HTML\u7ed3\u6784<\/h4>\n<\/p>\n<p><p>HTML\u6587\u6863\u901a\u5e38\u7531\u4e00\u7cfb\u5217\u5d4c\u5957\u7684\u6807\u7b7e\u7ec4\u6210\uff0c\u901a\u8fc7\u6807\u7b7e\u7684\u5c42\u6b21\u7ed3\u6784\u53ef\u4ee5\u786e\u5b9a\u76ee\u6807\u6570\u636e\u7684\u5177\u4f53\u4f4d\u7f6e\u3002\u4f8b\u5982\uff0c\u4e0b\u9762\u662f\u4e00\u4e2a\u7b80\u5355\u7684HTML\u7ed3\u6784\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-html\">&lt;!DOCTYPE html&gt;<\/p>\n<p>&lt;html&gt;<\/p>\n<p>&lt;head&gt;<\/p>\n<p>    &lt;title&gt;Example Page&lt;\/title&gt;<\/p>\n<p>&lt;\/head&gt;<\/p>\n<p>&lt;body&gt;<\/p>\n<p>    &lt;div class=&quot;content&quot;&gt;<\/p>\n<p>        &lt;h1&gt;Header&lt;\/h1&gt;<\/p>\n<p>        &lt;p&gt;Paragraph 1&lt;\/p&gt;<\/p>\n<p>        &lt;p&gt;Paragraph 2&lt;\/p&gt;<\/p>\n<p>    &lt;\/div&gt;<\/p>\n<p>&lt;\/body&gt;<\/p>\n<p>&lt;\/html&gt;<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u5728\u8fd9\u4e2a\u4f8b\u5b50\u4e2d\uff0c\u76ee\u6807\u6570\u636e\u201cParagraph 1\u201d\u548c\u201cParagraph 2\u201d\u4f4d\u4e8e<code>&lt;div 
class=&quot;content&quot;&gt;<\/code>\u6807\u7b7e\u5185\u7684\u4e24\u4e2a<code>&lt;p&gt;<\/code>\u6807\u7b7e\u4e2d\u3002<\/p>\n<\/p>\n<p><h3>\u4e09\u3001\u9009\u62e9\u9002\u5f53\u7684\u5b9a\u4f4d\u65b9\u6cd5<\/h3>\n<\/p>\n<p><p>\u6839\u636e\u7f51\u9875\u7684\u7ed3\u6784\uff0c\u53ef\u4ee5\u9009\u62e9\u5408\u9002\u7684\u5b9a\u4f4d\u65b9\u6cd5\u6765\u63d0\u53d6\u76ee\u6807\u6570\u636e\u3002\u5e38\u7528\u7684\u5b9a\u4f4d\u65b9\u6cd5\u5305\u62ec\u6807\u7b7e\u9009\u62e9\u5668\u3001\u7c7b\u9009\u62e9\u5668\u3001ID\u9009\u62e9\u5668\u548c\u5c5e\u6027\u9009\u62e9\u5668\u3002<\/p>\n<\/p>\n<p><h4>1. \u6807\u7b7e\u9009\u62e9\u5668<\/h4>\n<\/p>\n<p><p>\u6807\u7b7e\u9009\u62e9\u5668\u662f\u6839\u636eHTML\u6807\u7b7e\u540d\u79f0\u6765\u9009\u62e9\u5143\u7d20\u7684\u3002\u9002\u7528\u4e8e\u9700\u8981\u63d0\u53d6\u6240\u6709\u76f8\u540c\u6807\u7b7e\u7684\u60c5\u51b5\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u67e5\u627e\u6240\u6709\u7684&lt;p&gt;\u6807\u7b7e<\/p>\n<p>paragraphs = soup.find_all(&#39;p&#39;)<\/p>\n<p>for p in paragraphs:<\/p>\n<p>    print(p.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. \u7c7b\u9009\u62e9\u5668<\/h4>\n<\/p>\n<p><p>\u7c7b\u9009\u62e9\u5668\u662f\u6839\u636e\u5143\u7d20\u7684class\u5c5e\u6027\u6765\u9009\u62e9\u5143\u7d20\u7684\u3002\u9002\u7528\u4e8e\u9700\u8981\u63d0\u53d6\u5177\u6709\u7279\u5b9a\u7c7b\u540d\u7684\u5143\u7d20\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u67e5\u627e\u5177\u6709\u7279\u5b9a\u7c7b\u540d\u7684\u6807\u7b7e<\/p>\n<p>specific_class = soup.find_all(&#39;div&#39;, class_=&#39;content&#39;)<\/p>\n<p>for div in specific_class:<\/p>\n<p>    print(div.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
ID\u9009\u62e9\u5668<\/h4>\n<\/p>\n<p><p>ID\u9009\u62e9\u5668\u662f\u6839\u636e\u5143\u7d20\u7684id\u5c5e\u6027\u6765\u9009\u62e9\u5143\u7d20\u7684\u3002\u9002\u7528\u4e8e\u9700\u8981\u63d0\u53d6\u5177\u6709\u552f\u4e00\u6807\u8bc6\u7684\u5143\u7d20\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u67e5\u627e\u5177\u6709\u7279\u5b9aid\u7684\u6807\u7b7e<\/p>\n<p>specific_id = soup.find(&#39;div&#39;, id=&#39;specificid&#39;)<\/p>\n<p>print(specific_id.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4. \u5c5e\u6027\u9009\u62e9\u5668<\/h4>\n<\/p>\n<p><p>\u5c5e\u6027\u9009\u62e9\u5668\u662f\u6839\u636e\u5143\u7d20\u7684\u7279\u5b9a\u5c5e\u6027\uff08\u5982name\u3001href\u7b49\uff09\u6765\u9009\u62e9\u5143\u7d20\u7684\u3002\u9002\u7528\u4e8e\u9700\u8981\u63d0\u53d6\u5177\u6709\u7279\u5b9a\u5c5e\u6027\u7684\u5143\u7d20\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u67e5\u627e\u5177\u6709\u7279\u5b9a\u5c5e\u6027\u7684\u6807\u7b7e<\/p>\n<p>specific_attr = soup.find_all(&#39;a&#39;, href=&#39;http:\/\/example.com&#39;)<\/p>\n<p>for a in specific_attr:<\/p>\n<p>    print(a.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u7ed3\u5408\u6b63\u5219\u8868\u8fbe\u5f0f<\/h3>\n<\/p>\n<p><p>\u5728\u67d0\u4e9b\u60c5\u51b5\u4e0b\uff0c\u7f51\u9875\u7684\u7ed3\u6784\u53ef\u80fd\u6bd4\u8f83\u590d\u6742\uff0c\u5355\u7eaf\u4f9d\u9760\u9009\u62e9\u5668\u96be\u4ee5\u51c6\u786e\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\u3002\u8fd9\u65f6\uff0c\u53ef\u4ee5\u7ed3\u5408\u6b63\u5219\u8868\u8fbe\u5f0f\u8fdb\u884c\u5339\u914d\u548c\u63d0\u53d6\u3002<\/p>\n<\/p>\n<p><h4>1. 
\u4f7f\u7528re\u5e93<\/h4>\n<\/p>\n<p><p>Python\u4e2d\u7684re\u5e93\u63d0\u4f9b\u4e86\u4e30\u5bcc\u7684\u6b63\u5219\u8868\u8fbe\u5f0f\u529f\u80fd\uff0c\u53ef\u4ee5\u65b9\u4fbf\u5730\u8fdb\u884c\u6a21\u5f0f\u5339\u914d\u548c\u5b57\u7b26\u4e32\u63d0\u53d6\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import re<\/p>\n<p>html_content = &quot;&lt;html&gt;&lt;body&gt;&lt;p&gt;Hello, World!&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;&quot;<\/p>\n<p>pattern = re.compile(r&#39;&lt;p&gt;(.*?)&lt;\/p&gt;&#39;)<\/p>\n<p>result = pattern.findall(html_content)<\/p>\n<p>print(result)  # \u8f93\u51fa\uff1a[&#39;Hello, World!&#39;]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. \u5728BeautifulSoup\u4e2d\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f<\/h4>\n<\/p>\n<p><p>BeautifulSoup\u652f\u6301\u5728\u67e5\u627e\u5143\u7d20\u65f6\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u53ef\u4ee5\u66f4\u7075\u6d3b\u5730\u5339\u914d\u76ee\u6807\u6570\u636e\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import re<\/p>\n<p>from bs4 import BeautifulSoup<\/p>\n<p>html_content = &quot;&lt;html&gt;&lt;body&gt;&lt;p class=&#39;content&#39;&gt;Hello, World!&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;&quot;<\/p>\n<p>soup = BeautifulSoup(html_content, &#39;lxml&#39;)<\/p>\n<h2><strong>\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u5339\u914dclass\u5c5e\u6027<\/strong><\/h2>\n<p>pattern = re.compile(r&#39;content&#39;)<\/p>\n<p>result = soup.find_all(&#39;p&#39;, class_=pattern)<\/p>\n<p>for p in result:<\/p>\n<p>    
print(p.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e94\u3001\u5904\u7406\u52a8\u6001\u7f51\u9875<\/h3>\n<\/p>\n<p><p>\u6709\u4e9b\u7f51\u9875\u7684\u5185\u5bb9\u662f\u901a\u8fc7JavaScript\u52a8\u6001\u52a0\u8f7d\u7684\uff0c\u76f4\u63a5\u89e3\u6790HTML\u6e90\u4ee3\u7801\u65e0\u6cd5\u83b7\u53d6\u5230\u5b8c\u6574\u7684\u6570\u636e\u3002\u8fd9\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528Selenium\u5e93\u6765\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\uff0c\u5e76\u83b7\u53d6\u52a8\u6001\u52a0\u8f7d\u540e\u7684\u7f51\u9875\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><h4>1. \u5b89\u88c5Selenium<\/h4>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u9700\u8981\u5b89\u88c5Selenium\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install selenium<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u540c\u65f6\uff0c\u8fd8\u9700\u8981\u4e0b\u8f7d\u4e0e\u6d4f\u89c8\u5668\u5bf9\u5e94\u7684WebDriver\uff08\u5982ChromeDriver\uff09\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\"># \u4e0b\u8f7d\u5e76\u89e3\u538bChromeDriver\uff0c\u5c06\u5176\u8def\u5f84\u6dfb\u52a0\u5230\u73af\u5883\u53d8\u91cf\u4e2d<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. 
\u4f7f\u7528Selenium\u83b7\u53d6\u52a8\u6001\u7f51\u9875\u5185\u5bb9<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528Selenium\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\uff0c\u5e76\u83b7\u53d6\u52a8\u6001\u52a0\u8f7d\u540e\u7684\u7f51\u9875\u5185\u5bb9\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from selenium import webdriver<\/p>\n<p>from bs4 import BeautifulSoup<\/p>\n<p># \u521d\u59cb\u5316WebDriver<\/p>\n<p>driver = webdriver.Chrome()<\/p>\n<p># \u6253\u5f00\u7f51\u9875<\/p>\n<p>url = &quot;http:\/\/example.com&quot;<\/p>\n<p>driver.get(url)<\/p>\n<p># \u7b49\u5f85\u9875\u9762\u52a0\u8f7d\u5b8c\u6210<\/p>\n<p>driver.implicitly_wait(10)<\/p>\n<p># \u83b7\u53d6\u7f51\u9875\u5185\u5bb9<\/p>\n<p>html_content = driver.page_source<\/p>\n<p># \u89e3\u6790\u7f51\u9875\u5185\u5bb9<\/p>\n<p>soup = BeautifulSoup(html_content, &#39;lxml&#39;)<\/p>\n<p># \u67e5\u627e\u76ee\u6807\u6570\u636e<\/p>\n<p>result = soup.find_all(&#39;p&#39;)<\/p>\n<p>for p in result:<\/p>\n<p>    print(p.text)<\/p>\n<p># \u5173\u95edWebDriver<\/p>\n<p>driver.quit()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516d\u3001\u5904\u7406\u53cd\u722c\u866b\u673a\u5236<\/h3>\n<\/p>\n<p><p>\u5728\u8fdb\u884c\u7f51\u9875\u6570\u636e\u6293\u53d6\u65f6\uff0c\u53ef\u80fd\u4f1a\u9047\u5230\u4e00\u4e9b\u53cd\u722c\u866b\u673a\u5236\uff0c\u5982IP\u5c01\u7981\u3001\u9a8c\u8bc1\u7801\u3001\u4eba\u673a\u9a8c\u8bc1\u7b49\u3002\u4e3a\u4e86\u5e94\u5bf9\u8fd9\u4e9b\u673a\u5236\uff0c\u53ef\u4ee5\u91c7\u53d6\u4ee5\u4e0b\u63aa\u65bd\uff1a<\/p>\n<\/p>\n<p><h4>1. 
\u8bbe\u7f6e\u8bf7\u6c42\u5934<\/h4>\n<\/p>\n<p><p>\u901a\u8fc7\u8bbe\u7f6e\u5408\u9002\u7684\u8bf7\u6c42\u5934\uff0c\u53ef\u4ee5\u6a21\u62df\u771f\u5b9e\u7528\u6237\u7684\u6d4f\u89c8\u5668\u8bf7\u6c42\uff0c\u907f\u514d\u88ab\u8bc6\u522b\u4e3a\u722c\u866b\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">headers = {<\/p>\n<p>    &#39;User-Agent&#39;: &#39;Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110 Safari\/537.3&#39;<\/p>\n<p>}<\/p>\n<p>response = requests.get(url, headers=headers)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. \u4f7f\u7528\u4ee3\u7406<\/h4>\n<\/p>\n<p><p>\u901a\u8fc7\u4f7f\u7528\u4ee3\u7406IP\uff0c\u53ef\u4ee5\u907f\u514d\u56e0\u9891\u7e41\u8bbf\u95ee\u540c\u4e00IP\u800c\u88ab\u5c01\u7981\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">proxies = {<\/p>\n<p>    &#39;http&#39;: &#39;http:\/\/10.10.1.10:3128&#39;,<\/p>\n<p>    &#39;https&#39;: &#39;http:\/\/10.10.1.10:1080&#39;,<\/p>\n<p>}<\/p>\n<p>response = requests.get(url, headers=headers, proxies=proxies)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
\u8bbe\u7f6e\u8bf7\u6c42\u95f4\u9694<\/h4>\n<\/p>\n<p><p>\u901a\u8fc7\u8bbe\u7f6e\u5408\u7406\u7684\u8bf7\u6c42\u95f4\u9694\uff0c\u907f\u514d\u9891\u7e41\u8bbf\u95ee\u540c\u4e00\u7f51\u7ad9\uff0c\u964d\u4f4e\u88ab\u5c01\u7981\u7684\u98ce\u9669\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import time<\/p>\n<p>for i in range(10):<\/p>\n<p>    response = requests.get(url, headers=headers)<\/p>\n<p>    print(response.status_code)<\/p>\n<p>    time.sleep(2)  # \u8bbe\u7f6e2\u79d2\u7684\u8bf7\u6c42\u95f4\u9694<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e03\u3001\u6570\u636e\u5b58\u50a8<\/h3>\n<\/p>\n<p><p>\u5728\u6210\u529f\u6293\u53d6\u76ee\u6807\u6570\u636e\u540e\uff0c\u9700\u8981\u5c06\u6570\u636e\u5b58\u50a8\u5230\u5408\u9002\u7684\u5b58\u50a8\u4ecb\u8d28\u4e2d\uff0c\u4ee5\u4fbf\u540e\u7eed\u5206\u6790\u548c\u5904\u7406\u3002\u5e38\u7528\u7684\u6570\u636e\u5b58\u50a8\u65b9\u5f0f\u5305\u62ec\u6587\u4ef6\u5b58\u50a8\u3001\u6570\u636e\u5e93\u5b58\u50a8\u548c\u4e91\u5b58\u50a8\u3002<\/p>\n<\/p>\n<p><h4>1. \u6587\u4ef6\u5b58\u50a8<\/h4>\n<\/p>\n<p><p>\u5c06\u6570\u636e\u5b58\u50a8\u5230\u672c\u5730\u6587\u4ef6\u4e2d\uff0c\u9002\u7528\u4e8e\u6570\u636e\u91cf\u8f83\u5c0f\u7684\u60c5\u51b5\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">with open(&#39;data.txt&#39;, &#39;w&#39;, encoding=&#39;utf-8&#39;) as file:<\/p>\n<p>    for p in result:<\/p>\n<p>        file.write(p.text + &#39;\\n&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. 
\u6570\u636e\u5e93\u5b58\u50a8<\/h4>\n<\/p>\n<p><p>\u5c06\u6570\u636e\u5b58\u50a8\u5230\u6570\u636e\u5e93\u4e2d\uff0c\u9002\u7528\u4e8e\u6570\u636e\u91cf\u8f83\u5927\u4e14\u9700\u8981\u9ad8\u6548\u67e5\u8be2\u7684\u60c5\u51b5\u3002\u5e38\u7528\u7684\u6570\u636e\u5e93\u5305\u62ecSQLite\u3001MySQL\u3001PostgreSQL\u7b49\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import sqlite3<\/p>\n<p># \u8fde\u63a5\u6570\u636e\u5e93<\/p>\n<p>conn = sqlite3.connect(&#39;data.db&#39;)<\/p>\n<p>cursor = conn.cursor()<\/p>\n<p># \u521b\u5efa\u8868<\/p>\n<p>cursor.execute(&#39;&#39;&#39;<\/p>\n<p>    CREATE TABLE IF NOT EXISTS data (<\/p>\n<p>        id INTEGER PRIMARY KEY AUTOINCREMENT,<\/p>\n<p>        content TEXT<\/p>\n<p>    )<\/p>\n<p>&#39;&#39;&#39;)<\/p>\n<p># \u63d2\u5165\u6570\u636e<\/p>\n<p>for p in result:<\/p>\n<p>    cursor.execute(&#39;INSERT INTO data (content) VALUES (?)&#39;, (p.text,))<\/p>\n<p># \u63d0\u4ea4\u4e8b\u52a1<\/p>\n<p>conn.commit()<\/p>\n<p># \u5173\u95ed\u8fde\u63a5<\/p>\n<p>conn.close()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
\u4e91\u5b58\u50a8<\/h4>\n<\/p>\n<p><p>\u5c06\u6570\u636e\u5b58\u50a8\u5230\u4e91\u5b58\u50a8\u670d\u52a1\uff08\u5982AWS S3\u3001Google Cloud Storage\u7b49\uff09\u4e2d\uff0c\u9002\u7528\u4e8e\u9700\u8981\u9ad8\u53ef\u7528\u6027\u548c\u5206\u5e03\u5f0f\u5b58\u50a8\u7684\u60c5\u51b5\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import boto3<\/p>\n<h2><strong>\u521d\u59cb\u5316S3\u5ba2\u6237\u7aef<\/strong><\/h2>\n<p>s3 = boto3.client(&#39;s3&#39;)<\/p>\n<h2><strong>\u4e0a\u4f20\u6570\u636e\u5230S3<\/strong><\/h2>\n<p>s3.put_object(Bucket=&#39;mybucket&#39;, Key=&#39;data.txt&#39;, Body=&#39;\\n&#39;.join(p.text for p in result))<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u603b\u7ed3\uff1a\u901a\u8fc7\u9009\u62e9\u5408\u9002\u7684\u89e3\u6790\u5e93\u3001\u5206\u6790\u7f51\u9875\u7ed3\u6784\u3001\u9009\u62e9\u9002\u5f53\u7684\u5b9a\u4f4d\u65b9\u6cd5\u3001\u7ed3\u5408\u6b63\u5219\u8868\u8fbe\u5f0f\u3001\u5904\u7406\u52a8\u6001\u7f51\u9875\u3001\u5e94\u5bf9\u53cd\u722c\u866b\u673a\u5236\u548c\u5b58\u50a8\u6570\u636e\uff0c\u53ef\u4ee5\u6709\u6548\u5730\u5b9e\u73b0Python\u722c\u866b\u7684\u6570\u636e\u5b9a\u4f4d\u548c\u63d0\u53d6\u3002\u5e0c\u671b\u672c\u6587\u7684\u5185\u5bb9\u5bf9\u60a8\u6709\u6240\u5e2e\u52a9\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u9009\u62e9\u5408\u9002\u7684\u722c\u866b\u5e93\u6765\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\uff1f<\/strong><br 
\/>\u5728\u8fdb\u884cPython\u722c\u866b\u65f6\uff0c\u9009\u62e9\u5408\u9002\u7684\u5e93\u81f3\u5173\u91cd\u8981\u3002\u5e38\u7528\u7684\u5e93\u5305\u62ecBeautifulSoup\u3001Scrapy\u548cRequests\u7b49\u3002BeautifulSoup\u9002\u5408\u89e3\u6790HTML\u548cXML\u6587\u6863\uff0cScrapy\u5219\u662f\u4e00\u4e2a\u5f3a\u5927\u7684\u6846\u67b6\uff0c\u9002\u5408\u5904\u7406\u590d\u6742\u7684\u722c\u53d6\u4efb\u52a1\uff0c\u800cRequests\u5219\u4e3b\u8981\u7528\u4e8e\u53d1\u9001\u7f51\u7edc\u8bf7\u6c42\u3002\u5728\u9009\u62e9\u65f6\uff0c\u8003\u8651\u76ee\u6807\u7f51\u7ad9\u7684\u7ed3\u6784\u548c\u6570\u636e\u7c7b\u578b\uff0c\u4ee5\u53ca\u722c\u866b\u7684\u590d\u6742\u5ea6\uff0c\u9009\u62e9\u6700\u9002\u5408\u7684\u5de5\u5177\u4ee5\u63d0\u9ad8\u6548\u7387\u3002<\/p>\n<p><strong>\u5982\u4f55\u5904\u7406\u52a8\u6001\u7f51\u9875\u4ee5\u83b7\u53d6\u76ee\u6807\u6570\u636e\uff1f<\/strong><br \/>\u8bb8\u591a\u7f51\u7ad9\u4f7f\u7528JavaScript\u52a8\u6001\u52a0\u8f7d\u5185\u5bb9\uff0c\u56e0\u6b64\u5728\u722c\u53d6\u65f6\u53ef\u80fd\u65e0\u6cd5\u76f4\u63a5\u83b7\u53d6\u6240\u9700\u6570\u636e\u3002\u5bf9\u4e8e\u8fd9\u79cd\u60c5\u51b5\uff0c\u53ef\u4ee5\u4f7f\u7528Selenium\u7b49\u5de5\u5177\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\uff0c\u7b49\u5f85\u7f51\u9875\u52a0\u8f7d\u5b8c\u6210\u540e\u518d\u63d0\u53d6\u6570\u636e\u3002\u6b64\u5916\uff0c\u4f7f\u7528API\u8c03\u7528\u83b7\u53d6\u6570\u636e\u4e5f\u662f\u4e00\u4e2a\u6709\u6548\u7684\u66ff\u4ee3\u65b9\u6848\uff0c\u8bb8\u591a\u7f51\u7ad9\u5728\u540e\u53f0\u63d0\u4f9bRESTful API\uff0c\u53ef\u4ee5\u76f4\u63a5\u8bf7\u6c42\u5e76\u89e3\u6790\u8fd4\u56de\u7684JSON\u6570\u636e\u3002<\/p>\n<p><strong>\u5982\u4f55\u907f\u514d\u88ab\u7f51\u7ad9\u5c01\u7981\uff1f<\/strong><br 
\/>\u5728\u8fdb\u884c\u6570\u636e\u722c\u53d6\u65f6\uff0c\u9075\u5faa\u7f51\u7ad9\u7684robots.txt\u534f\u8bae\u662f\u975e\u5e38\u91cd\u8981\u7684\uff0c\u5b83\u6307\u793a\u4e86\u5141\u8bb8\u548c\u7981\u6b62\u722c\u53d6\u7684\u5185\u5bb9\u3002\u6b64\u5916\uff0c\u8bbe\u7f6e\u9002\u5f53\u7684\u8bf7\u6c42\u95f4\u9694\u548c\u968f\u673aUser-Agent\u53ef\u4ee5\u51cf\u5c11\u88ab\u5c01\u7981\u7684\u98ce\u9669\u3002\u4f7f\u7528\u4ee3\u7406IP\u4e5f\u662f\u4e00\u79cd\u5e38\u89c1\u7684\u7b56\u7565\uff0c\u53ef\u4ee5\u5e2e\u52a9\u5206\u6563\u8bf7\u6c42\u6765\u6e90\uff0c\u964d\u4f4e\u88ab\u8bc6\u522b\u4e3a\u722c\u866b\u7684\u53ef\u80fd\u6027\u3002\u4fdd\u6301\u826f\u597d\u7684\u722c\u866b\u9053\u5fb7\uff0c\u786e\u4fdd\u4e0d\u5bf9\u76ee\u6807\u7f51\u7ad9\u9020\u6210\u8d1f\u62c5\uff0c\u662f\u5b9e\u73b0\u957f\u4e45\u722c\u53d6\u7684\u5173\u952e\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u5728Python\u722c\u866b\u4e2d\uff0c\u5b9a\u4f4d\u76ee\u6807\u6570\u636e\u7684\u6838\u5fc3\u65b9\u6cd5\u662f\u4f7f\u7528 \u9009\u62e9\u5408\u9002\u7684\u89e3\u6790\u5e93\u3001\u5206\u6790\u7f51\u9875\u7ed3\u6784\u3001\u9009\u62e9\u9002\u5f53\u7684\u5b9a\u4f4d\u65b9\u6cd5\u3001\u7ed3\u5408\u6b63 
[&hellip;]","protected":false},"author":3,"featured_media":1160783,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1160771"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1160771"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1160771\/revisions"}],"predecessor-version":[{"id":1160786,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1160771\/revisions\/1160786"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1160783"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1160771"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1160771"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1160771"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}