标签： Python3

使用python3拆分大文件txt 图文教程

最近网站被攻击，cdn一下被打2TB，然后下载了日志进行分析，但是日志文件有几十兆，所以需要做一下切割，这里我们记一下python3的拆分文件脚本，以备后用。

Python作为快速开发工具，其代码表达力强，开发效率高，因此用Python快速写一个，还是可行的。

python3代码脚本

import os
import sys
import random
import threading
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
import time
# Module-wide mutex; the download code holds it while writing a file and
# printing its progress line.
lock = threading.Lock()


class TotalSizeCounter:
    """Running total of sizes that can be updated from several threads.

    Every read and write of ``total_size`` happens while ``self.lock`` is
    held, so concurrent ``add_size`` calls do not lose updates.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.total_size = 0

    def add_size(self, size):
        """Add ``size`` to the running total."""
        self.lock.acquire()
        try:
            self.total_size += size
        finally:
            self.lock.release()

    def get_total_size(self):
        """Return the current running total."""
        self.lock.acquire()
        try:
            current_total = self.total_size
        finally:
            self.lock.release()
        return current_total


# Single shared accumulator for the whole script.
total_size_counter = TotalSizeCounter()
def generate_user_agent():
    """Return a randomly chosen User-Agent header value.

    The value is picked with ``random.choice`` from a fixed pool of eleven
    browser identification strings covering iOS, Android, Windows, macOS
    and Linux.
    """
    agents_by_platform = {
        "iOS": [
            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/84.0.4147.122 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/84.0.4147.122 Mobile/15E148 Safari/604.1",
        ],
        "Android": [
            "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Mobile Safari/537.36",
        ],
        "Windows": [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/94.0.992.31",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/100.0",
        ],
        "macOS": [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
        ],
        "Linux": [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
        ],
    }
    # Flatten in insertion order so the pool is one flat list of strings.
    candidates = [
        agent
        for platform_agents in agents_by_platform.values()
        for agent in platform_agents
    ]
    return random.choice(candidates)
def download_image(url, user_agent, output_folder, timeout=30):
    """Download one image, save it under ``output_folder`` and record its size.

    The file is named after the last component of the URL path, with a
    random ``_<1..10000>`` suffix inserted before the extension. The number
    of bytes received is added to ``total_size_counter`` and a progress
    line is printed.

    Args:
        url: Absolute URL of the image.
        user_agent: Value sent as the ``User-Agent`` request header.
        output_folder: Existing directory the file is written into.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the calling thread forever.

    Returns:
        None. Every exception is caught and reported with ``print`` so a
        single failed download never propagates to the caller.
    """
    try:
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        content = response.content
        parsed_url = urlparse(url)
        filename = os.path.join(output_folder, os.path.basename(parsed_url.path))
        stem, extension = os.path.splitext(filename)
        # The random suffix keeps repeated downloads of the same image from
        # overwriting each other (collisions are unlikely, not impossible).
        filename = f"{stem}_{random.randint(1, 10000)}{extension}"
        # Measure the payload itself. Calling os.path.getsize() while the
        # file is still open can see buffered, not-yet-flushed data as
        # missing and report a size that is too small (often 0).
        file_size = len(content)
        with lock:
            with open(filename, 'wb') as file:
                file.write(content)
            total_size_counter.add_size(file_size)
            print(f"Downloaded image {url} as {filename}, Size: {file_size / (1024 * 1024):.2f} MB")
    except Exception as e:
        # Broad on purpose: this is the top of a worker thread.
        print(f"Error downloading image {url}: {e}")
def download_images(url, user_agent, output_folder):
    """Fetch the page at ``url`` and download the images it references.

    Two sources of image URLs are processed, in this order:

    1. ``src`` attributes of ``<img>`` tags that are non-empty and do not
       start with ``data:``, ``http:`` or ``https:``; each is resolved
       against ``url`` with ``urljoin``.
    2. Absolute http(s) URLs ending in png/jpg/jpeg/gif/bmp found in
       ``data-src`` attributes of ``<img>`` tags, matched by regular
       expression on the raw page text.

    Each image is handed to ``download_image`` in its own thread, and that
    thread is joined before the next one starts, so images are fetched one
    at a time. Any exception is caught and reported with ``print``.
    """

    def fetch_and_wait(image_url):
        # Run a single download in a worker thread and wait for it to end.
        worker = threading.Thread(target=download_image, args=(image_url, user_agent, output_folder))
        worker.start()
        worker.join()

    try:
        request_headers = {'User-Agent': user_agent}
        page = requests.get(url, headers=request_headers)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')
        for tag in document.find_all('img'):
            source = tag.get('src')
            if not source:
                continue
            if source.startswith(('data:', 'http:', 'https:')):
                continue
            fetch_and_wait(urljoin(url, source))
        lazy_image_pattern = r'<img[^>]*data-src=["\'](https?://[^"\']+\.(?:png|jpg|jpeg|gif|bmp))["\'][^>]*>'
        for lazy_url in re.findall(lazy_image_pattern, page.text):
            fetch_and_wait(lazy_url)
    except Exception as e:
        print(f"Error downloading images from {url}: {e}")
def main(url, num_iterations):
    """Run ``download_images`` ``num_iterations`` times and print a summary.

    Creates the ``files`` directory if it is missing and starts one thread
    per iteration, each with its own random User-Agent. After all threads
    finish it prints the total downloaded size and the elapsed time, then
    removes every entry in ``files`` and the directory itself.
    """
    output_dir = "files"
    started_at = time.time()  # start of the measured interval

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    workers = []
    for _ in range(num_iterations):
        agent = generate_user_agent()
        worker = threading.Thread(target=download_images, args=(url, agent, "files/"))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    finished_at = time.time()  # end of the measured interval
    elapsed_seconds = finished_at - started_at
    bytes_per_mb = 1024 * 1024
    total_mb = total_size_counter.get_total_size() / bytes_per_mb
    print(f"Total downloaded size from all threads: {total_mb:.2f} MB")
    print(f"Script execution time: {elapsed_seconds:.2f} seconds")

    # Remove the "files" directory together with everything inside it.
    if os.path.exists(output_dir):
        for entry in os.listdir(output_dir):
            os.remove(os.path.join(output_dir, entry))
        os.rmdir(output_dir)
if __name__ == "__main__":
    # Exactly two arguments are required: the page URL and the number of
    # iterations. sys.exit(<str>) prints the message to stderr and exits
    # with status 1, so callers can detect the failure.
    if len(sys.argv) != 3:
        sys.exit("Usage: python script.py <url> <num_iterations>")
    url = sys.argv[1]
    try:
        num_iterations = int(sys.argv[2])
    except ValueError:
        # Report a bad count as a usage error instead of a traceback.
        sys.exit(f"num_iterations must be an integer, got {sys.argv[2]!r}")
    main(url, num_iterations)

请在空目录中运行，脚本将自动在当前目录创建 files 文件夹，脚本执行完成后会删除所有下载的文件。

脚本将会计算所有下载的大小以及执行花费时间。

2023-08-10

python命令 – 解释器、交互式、面向对象的编程语言
Python是一种解释型的、交互式的、面向对象的编程语言，它结合了非凡的功能和非常清晰的语法。Python Library Reference记录了内置的和标准的类型、常量、函数和模块。
语法格式：python [参数]
常用参数：
-c 直接运行python语句
-v 会输出每一个模块引用信息
-i 运行完python脚本文件以后打开一个python环境
-m 将模块按照脚本执行
参考实例
直接运行python语句：
```
[root@jiloc ~]# python -c "print('Hello world')"
```
查看python所有模块引用的信息：
```
[root@jiloc ~]# python -v
```
运行完python脚本文件以后打开一个python环境：
```
[root@jiloc ~]# python -i main.py
```
将模块按照脚本执行，实现一个简单的下载服务器：
```
[root@jiloc ~]# python3 -m http.server 80
```
与该功能相关的Linux命令：
- users命令 – 显示当前登录的用户
- protoize命令 – 添加函数原型
- eval命令 – 重新运算求出参数
- zabbix_proxy命令 – zabbix代理守护进程
- addr2line命令 – 函数地址解析工具
- msgen命令 – 创建英文邮件目录
- wget命令 – 下载网络文件
- declare命令 – 声明shell变量
- xlsatoms命令 – 列出原子成分
- pigz命令 – 多线程的解压缩文件
2022-12-13
关于django.db.utils.NotSupportedError: deterministic=True requires SQLite 3.8.3错误的处理办法
最近，很多朋友都把自己的操作系统、Python、Django都升级到最新版本了，使用默认SQLite数据，运行时会报出类似django.db.utils.NotSupportedError: deterministic=True requires SQLite 3.8.3的错误提示。这主要是操作系统默认SQLite数据库版本太低造成的。这里给大家提供三种解决办法，希望能帮到大家。
第一种：
升级系统里的SQLite版本。方法：https://www.django.cn/forum/forum-487.html
注意，如需要最新版本的话，请从官网下载最新的软件包。下载址：https://www.sqlite.org/download.html
第二种：找到报错文件，如/usr/local/python3/lib/python3.8/site-packages/django/db/backends/sqlite3/base.py 打开它，找到
```
def check_sqlite_version():
    if Database.sqlite_version_info < (3, 9, 0):
```
把里面小括号里的 SQLite版本（3，9，0）修改成你当前系统里版本就好。
第三种：
使用第三方包运行SQLite。把sqlite3 更换为pysqlite3 和 pysqlite3-binary方法：1、安装pysqlite3和pysqlite3-binary
```
pip install pysqlite3
pip install pysqlite3-binary
```
2、打开文件/usr/local/python3/lib/python3.8/site-packages/django/db/backends/sqlite3/base.py，找到 from sqlite3 import dbapi2 as Database 注释它，添加代码
```
#from sqlite3 import dbapi2 as Database  #注释它
from pysqlite3 import dbapi2 as Database #新加这段代码
```
三种，总有一种适合你。
2021-12-04

Linux下使用Shell命令查看CPU占用率高的进程脚本

#!/bin/bash
# Sample per-process CPU usage every INTERVAL seconds for SECS seconds and
# print the TOPK command names with the highest accumulated %CPU.
# The {{...}} placeholders must be substituted before the script is run.
TOPK={{topk}}
SECS={{samplingTime}}
INTERVAL={{interval}}
STEPS=$(( $SECS / $INTERVAL ))
TEMP_FILE_PREFIX="/tmp/tat_public_cpu_usage"

# mktemp creates the file atomically under an unpredictable name, which
# avoids appending to a pre-existing file or symlink in /tmp. The trap
# removes it even when the script is interrupted.
TEMP_FILE=$(mktemp "${TEMP_FILE_PREFIX}.XXXXXX") || exit 1
trap 'rm -f "$TEMP_FILE"' EXIT

echo Watching CPU usage...
for ((i = 0; i < STEPS; i++))
do
  # "pcpu=,comm=" suppresses the header line. %CPU comes first so that a
  # command name containing spaces cannot shift the numeric column.
  ps -eo pcpu=,comm= >> "$TEMP_FILE"
  sleep "$INTERVAL"
done
echo
echo CPU eaters :
# Sum %CPU per command name, sort by the sum (descending), keep the top
# TOPK rows, then print them as "<name> <sum>".
awk '
{
  cpu = $1
  $1 = ""
  sub(/^ +/, "")
  process[$0] += cpu
}
END {
  for (name in process) {
    printf("%s\t%s\n", process[name], name)
  }
}' "$TEMP_FILE" | sort -nr | head -n "$TOPK" |
awk -F '\t' '{ printf("%-20s %s\n", $2, $1) }'

参数名	参数值	描述

topk		前k个进程
samplingTime		采样时间, 单位秒
interval		采样间隔时间, 单位秒

本脚本来自腾讯云

2021-09-14

vscode 去除多余空行
使用正则表达式 ^\s*(?=\r?$)\n
```
^\s*(?=\r?$)\n
```
2021-09-03
经典的 Fork 炸弹解析
Jaromil 在 2002 年设计了最为精简的一个 Linux Fork 炸弹，整行代码只有 13 个字符，在 shell 中运行后几秒后系统就会宕机：
```
:(){ :|:& };:
```
这样看起来不是很好理解，我们可以更改下格式：
```
:(){	:|:&};:
```
更好理解一点的话就是这样:
```
bomb(){	bomb|bomb&};bomb
```
因为 shell 中函数可以省略 function 关键字，所以上面的 13 个字符的功能是定义一个函数并调用这个函数，函数的名称为 : ，主要的核心代码是 :|:&，可以看出这是一个对函数本身的递归调用，通过 & 实现在后台开启新进程运行，通过管道实现进程呈几何形式增长，最后再通过 : 来调用此函数引爆炸弹。因此，几秒钟内，系统就会因为处理不过来太多的进程而死机，解决的唯一办法就是重启。
Bomb 一下
秉着不作不死的心态，我们也来运行一下，于是我将矛头指向云主机，我使用了一个 2G 内存的云主机，首先在本地开启两个终端，在一个终端连接云主机后运行炸弹，几秒后再尝试用另外一个终端登录，效果可以看下图：
看，运行一段时间后直接报出了 -bash: fork: Cannot allocate memory，说明内存不足了。并且我在二号终端上尝试连接也没有任何反应。因为是虚拟的云主机，所以我只能通过主机服务商的后台来给主机断电重启。然后才能重新登录：
炸弹的危害
Fork 炸弹带来的后果就是耗尽服务器资源，使服务器不能正常的对外提供服务，也就是常说的 DoS。与传统 1v1、通过不断向服务器发送请求造成服务器崩溃不同，Fork 炸弹有种坐山观虎斗，不费一兵一卒斩敌人于马下的感觉。更吓人的是这个函数是不需要 root 权限就可以运行的。看到网上有帖子说某些人将个性签名改为 Fork 炸弹，结果真有好奇之人中枪，试想如果中枪的人是在公司服务器上运行的话，oh，！
预防方式
当然，Fork 炸弹没有那么可怕，用其它语言也可以分分钟写出来一个，例如，Python 版：
```
import os 	
    while True:  	
    os.fork()
```
Fork 炸弹的本质无非就是靠创建进程来抢占系统资源，在 Linux 中，我们可以通过 ulimit 命令来限制用户的某些行为，运行 ulimit -a 可以查看我们能做哪些限制：
```
ubuntu@10-10-57-151:~$ ulimit -a
core file size          (blocks, -c) 0
data seg size           (kbytes, -d) unlimited
scheduling priority             (-e) 0
file size               (blocks, -f) unlimited
pending signals                 (-i) 7782
max locked memory       (kbytes, -l) 64
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1024
pipe size            (512 bytes, -p) 8
POSIX message queues     (bytes, -q) 819200
real-time priority              (-r) 0
stack size              (kbytes, -s) 8192
cpu time               (seconds, -t) unlimited
max user processes              (-u) 7782
virtual memory          (kbytes, -v) unlimited
file locks                      (-x) unlimited
```
可以看到，-u 参数可以限制用户创建进程数，因此，我们可以使用 ulimit -u 20 来允许用户最多创建 20 个进程。这样就可以预防 Fork 炸弹。但这样是不彻底的，关闭终端后这个命令就失效了。我们可以通过修改 /etc/security/limits.conf 文件来进行更深层次的预防，在文件里添加如下一行（下面的 ubuntu 需更换为你的用户名）：
```
ubuntu - nproc 20
```
这样，退出后重新登录，就会发现最大进程数已经更改为 20 了：
这个时候我们再次运行炸弹就不会报内存不足了，而是提示 -bash: fork: retry: No child processes，说明 Linux 限制了炸弹创建进程。
2021-08-25

Django Python 后台列表中自定义权限筛选根据用户权限自动筛选列表

在Django的项目开发中，需求是要在后台的列表中让非管理员用户只列出自己的相关记录。下面我们记录一下 admin.py 详细的代码并给出详解。

class Sale_Admin(admin.ModelAdmin):
    """Admin configuration whose change list is filtered per user.

    Superusers see every record; all other users only see the rows
    returned by the filter in ``get_queryset``.
    """

    list_display = ('customer','idc', 'line_type', 'charge_type',  'saler', 'graph_id', 'alarm_times')
    list_per_page = 20
    # list_editable = ['graph_id',]
    list_filter = ('customer', 'saler')
    search_fields = ['customer__name','idc__name','graph_id']
    ordering = ('idc',)
    readonly_fields = ('v_max', 'v_nf', 'update_time', 'alarm_times')

    def get_queryset(self, request):
        """Return the rows the requesting user is allowed to list."""
        qs = super().get_queryset(request)
        if request.user.is_superuser:
            return qs
        # NOTE(review): 23 is a hard-coded saler value; presumably it should
        # be derived from request.user -- confirm against the model.
        return qs.filter(saler=23)

    # The four hooks below deny the corresponding action for every user.
    # NOTE(review): denying both view and change presumably makes the list
    # unreachable for everyone, superusers included -- confirm this is
    # intended outside of a demonstration.
    def has_add_permission(self, request):
        """Deny adding records."""
        return False

    def has_change_permission(self, request, obj=None):
        """Deny changing records."""
        return False

    def has_delete_permission(self, request, obj=None):
        """Deny deleting records."""
        return False

    def has_view_permission(self, request, obj=None):
        """Deny viewing records."""
        return False

get_queryset 可以自定义对象列表，比如非管理员登录，我们只显示saler值为23的记录。

当然还可以控制当前用户是否具有查看、修改、添加、删除权限。

has_add_permission has_change_permission has_delete_permission has_view_permission 则是判断是否具有对应的权限。

Django 官方文档：https://docs.djangoproject.com/en/3.1/ref/contrib/admin/#django.contrib.admin.ModelAdmin.get_queryset

2020-08-19

django models反向关联related_name详解

先上表结构 models.py 代码如下

class PeojectUser(models.Model):
    """Project participant: links a user to a project.

    ``user`` and ``invitee`` both reference ``UserInfo`` without a
    ``related_name``; this is the variant that makes ``makemigrations``
    fail with fields.E304.
    """

    user = models.ForeignKey(
        to='UserInfo',
        on_delete=models.CASCADE,
    )
    project = models.ForeignKey(
        to='Project',
        on_delete=models.CASCADE,
    )
    star = models.BooleanField(default=False)
    invitee = models.ForeignKey(
        to='UserInfo',
        on_delete=models.CASCADE,
    )
    create_datetime = models.DateTimeField(auto_now_add=True)

在django 执行 makemigrations 的时候，如果出现下面这种错误

SystemCheckError: System check identified some issues:
ERRORS:
web.PeojectUser.invitee: (fields.E304) Reverse accessor for 'PeojectUser.invitee' clashes with reverse accessor for 'PeojectUser.user'.
        HINT: Add or change a related_name argument to the definition for 'PeojectUser.invitee' or 'PeojectUser.user'.
web.PeojectUser.user: (fields.E304) Reverse accessor for 'PeojectUser.user' clashes with reverse accessor for 'PeojectUser.invitee'.
        HINT: Add or change a related_name argument to the definition for 'PeojectUser.user' or 'PeojectUser.invitee'.

这个问题出现的原因很简单，就是因为我们在 PeojectUser 模型中有2个字段都以一对多的方式引用了 UserInfo 表，这样在后面进行反向查询的时候，django 无法区分这两个关联，因此提示我们需要指定 related_name。
先说一个常规的问题：
正常我们的操作：

obj = UserInfo.objects.get(id=1)
# The default reverse accessor is "<lowercased model name>_set"; the model
# class is spelled PeojectUser, so the accessor is peojectuser_set.
obj.peojectuser_set.all()

但是由于我们在PeojectUser设置了2个关联，因此在我们后续执行的时候，django不知道需要反向关联哪个字段了。所以提示我们添加 related_name 参数。
针对上面的例子，我们做一下修改：

```python
class PeojectUser(models.Model):
    """Project participant: links a user to a project.

    The two foreign keys to ``UserInfo`` carry distinct ``related_name``
    values ('a' for ``user``, 'b' for ``invitee``).
    """

    user = models.ForeignKey(
        to='UserInfo',
        on_delete=models.CASCADE,
        related_name='a',
    )
    project = models.ForeignKey(
        to='Project',
        on_delete=models.CASCADE,
    )
    star = models.BooleanField(default=False)
    invitee = models.ForeignKey(
        to='UserInfo',
        on_delete=models.CASCADE,
        related_name='b',
    )
    create_datetime = models.DateTimeField(auto_now_add=True)

```

那么在设置了上面的格式后，如果要进行不同的反向关联，则可以使用如下方式：

obj = UserInfo.objects.get(id=1)
obj.a.all()
obj.b.all()

2020-08-06

python3 requests解决中文乱码问题的几种方法
有三种方法解决请求后乱码问题。
方法一：获取二进制数据，再利用str进行编码转换
```
import requests
url='http://music.baidu.com'
r = requests.get(url)
html=r.content
html_doc=str(html,'utf-8') #html_doc=html.decode("utf-8","ignore")
print(html_doc)
```
方法二：使用r.text
Requests 会自动解码来自服务器的内容。大多数 unicode 字符集都能被无缝地解码。请求发出后，Requests 会基于 HTTP 头部对响应的编码作出有根据的推测。当你访问 r.text 之时，Requests 会使用其推测的文本编码。你可以找出 Requests 使用了什么编码，并且能够使用 r.encoding 属性来改变它.
但是当响应头的 Content-Type 是 text 类型且没有指定 charset 时，Requests 会默认使用 ISO-8859-1 编码（即 r.encoding = 'ISO-8859-1'），中文页面因此会出现乱码。
可以 r.encoding 修改编码
```
import requests
url='http://music.baidu.com'
r=requests.get(url)
r.encoding='utf-8'
print(r.text)
```
方法三：apparent_encoding获取网页编码
上面的两个方法，适用于网页编码是utf-8的情况，但如果网页的默认编码不是utf-8，那么设置编码时就无从下手了，毕竟编码的类型那么多。这时可以使用 apparent_encoding 获取网页使用的编码，再进行设置，如下：
```
import requests
url='http://laoji.org'
r=requests.get(url)
print(r.apparent_encoding)
r.encoding=r.apparent_encoding
print(r.text)
```
2020-06-30
windows pip使用国内源图文教程
前面我们记录了MacOS 下 Python3 pip 配置国内源，下面我们记录一下Windows下更新pip源的方法。
win+R 打开用户目录%HOMEPATH%，在此目录下创建 pip 文件夹，在 pip 目录下创建 pip.ini 文件, 内容如下
在pip文件夹里新建的pip.ini代码如下
```
[global]
timeout = 6000
index-url = https://pypi.tuna.tsinghua.edu.cn/simple
trusted-host = pypi.tuna.tsinghua.edu.cn
```
2020-06-24

MacOS 下 Python3 pip 配置国内源

国内源列表（推荐用阿里云的）

阿里云	http://mirrors.aliyun.com/pypi/simple/
豆瓣 (douban)	http://pypi.douban.com/simple/
中国科技大学	https://pypi.mirrors.ustc.edu.cn/simple/
清华大学	https://pypi.tuna.tsinghua.edu.cn/simple/
中国科学技术大学	http://pypi.mirrors.ustc.edu.cn/simple/

方法一：安装时指定

pip install ipython -i http://mirrors.aliyun.com/pypi/simple/

如果提示 host 不被信任可以加上参数 --trusted-host

pip install ipython -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

方法二：永久设置

mkdir ~/.pip
vi ~/.pip/pip.conf # 创建一个pip配置文件# 写入配置

写入如下代码：

[global]
index-url = http://mirrors.aliyun.com/pypi/simple/
[install]
trusted-host = mirrors.aliyun.com

2020-06-24

VPS库存监控系统
最近闲来无事，弄了一个基于django+vue的东西出来，主要是用于监控一些商家的VPS库存情况。我们是vue新手，边学边做，有不对的地方还望大佬指点~
安装/Install
本环境需要python3的运行环境以及python3-pip。下载源码后运行 `pip3 install django`
然后将数据库导入
```
python3 manage.py makemigrations --merge ; python3 manage.py migrate
```
创建后台超级用户
```
 python3 manage.py createsuperuser
```
开始启动服务
```
python3 manage.py runserver 0.0.0.0:80
```
启动后，可以在浏览器中打开 http://127.0.0.1/admin 的“商品”及“商家”中填入数据，否则首页会运行报错。
定时监控/Crontab
定时每分钟检查VPS商家库存情况，需要在商家的属性中设定`检查库存`：`是`然后将以下命令加入到crontab中。(Linux命令详解：crontab 定时任务)
```
 */1 * * * * cd /data/wwwroot/vps-stock-monitor/ ; python3 manage.py run > /dev/null 2>&1 &
```
如在windows系统中，需要加入到计划任务中。
2020-06-24

-c	直接运行python语句
-v	会输出每一个模块引用信息
-i	运行完python脚本文件以后打开一个python环境
-m	将模块按照脚本执行

标签： Python3

python3代码脚本

与该功能相关的Linux命令：

Bomb 一下

炸弹的危害

预防方式

方法一：获取二进制数据，再利用str进行编码转换

方法二：使用r.text

方法三：apparent_encoding获取网页编码

方法一：安装时指定

方法二：永久设置

安装/Install

创建后台超级用户

开始启动服务

定时监控/Crontab