This commit is contained in:
commit
76bd37dd11
128 changed files with 11672 additions and 0 deletions
3
.gitattributes
vendored
Normal file
3
.gitattributes
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
*.js linguist-language=python
|
||||
*.css linguist-language=python
|
||||
*.html linguist-language=python
|
17
.github/workflows/main.yaml
vendored
Normal file
17
.github/workflows/main.yaml
vendored
Normal file
|
@ -0,0 +1,17 @@
|
|||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
contrib-readme-job:
|
||||
runs-on: ubuntu-latest
|
||||
name: A job to automate contrib in readme
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Contribute List
|
||||
uses: akhilmhdh/contributors-readme-action@v2.3.10
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
175
.gitignore
vendored
Normal file
175
.gitignore
vendored
Normal file
|
@ -0,0 +1,175 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
*.xml
|
||||
*.iml
|
||||
.idea
|
||||
/temp_image/
|
||||
/browser_data/
|
||||
/data/
|
||||
|
||||
*/.DS_Store
|
||||
.vscode
|
||||
|
||||
#New add
|
||||
test_parse.py
|
||||
test_soup.py
|
||||
test.htmlcov
|
28
LICENSE
Normal file
28
LICENSE
Normal file
|
@ -0,0 +1,28 @@
|
|||
非商业使用许可证 1.0
|
||||
|
||||
版权所有 (c) [2024] [relakkes@gmail.com]
|
||||
|
||||
鉴于:
|
||||
1. 版权所有者拥有和控制本软件和相关文档文件(以下简称“软件”)的版权;
|
||||
2. 使用者希望使用该软件;
|
||||
3. 版权所有者愿意在本许可证所述的条件下授权使用者使用该软件;
|
||||
|
||||
现因此,双方遵循相关法律法规,同意如下条款:
|
||||
|
||||
授权范围:
|
||||
1. 版权所有者特此免费授予接受本许可证的任何自然人或法人(以下简称“使用者”)非独占的、不可转让的权利,在非商业目的下使用、复制、修改、合并本软件,前提是遵守以下条件。
|
||||
|
||||
条件:
|
||||
1. 使用者必须在软件及其副本的所有合理显著位置包含上述版权声明和本许可证声明。
|
||||
2. 本软件不得用于任何商业目的,包括但不限于销售、营利或商业竞争。
|
||||
3. 未经版权所有者书面同意,不得将本软件用于任何商业用途。
|
||||
|
||||
免责声明:
|
||||
1. 本软件按“现状”提供,不提供任何形式的明示或暗示保证,包括但不限于对适销性、特定用途的适用性和非侵权的保证。
|
||||
2. 在任何情况下,版权所有者均不对因使用本软件而产生的,或在任何方式上与本软件有关的任何直接、间接、偶然、特殊、示例性或后果性损害负责(包括但不限于采购替代品或服务;使用、数据或利润的损失;或业务中断),无论这些损害是如何引起的,以及无论是通过合同、严格责任还是侵权行为(包括疏忽或其他方式)产生的,即使已被告知此类损害的可能性。
|
||||
|
||||
适用法律:
|
||||
1. 本许可证的解释和执行应遵循当地法律法规。
|
||||
2. 因本许可证引起的或与之相关的任何争议,双方应友好协商解决;协商不成时,任何一方可将争议提交至版权所有者所在地的人民法院诉讼解决。
|
||||
|
||||
本许可证构成双方之间关于本软件的完整协议,取代并合并以前的讨论、交流和协议,无论是口头还是书面的。
|
405
README.md
Normal file
405
README.md
Normal file
|
@ -0,0 +1,405 @@
|
|||
> **免责声明:**
|
||||
>
|
||||
> 大家请以学习为目的使用本仓库,爬虫违法违规的案件:https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China <br>
|
||||
>
|
||||
>本仓库的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。
|
||||
|
||||
> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
|
||||
# 仓库描述
|
||||
|
||||
**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**...。
|
||||
目前能抓取小红书、抖音、快手、B站、微博的视频、图片、评论、点赞、转发等信息。
|
||||
|
||||
原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数
|
||||
通过使用此方式,免去了复现核心加密JS代码,逆向难度大大降低
|
||||
|
||||
|
||||
## 功能列表
|
||||
| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
|
||||
|-----|-------|----------|-----|--------|-------|-------|-------|
|
||||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 创建并激活 python 虚拟环境
|
||||
```shell
|
||||
# 进入项目根目录
|
||||
cd MediaCrawler
|
||||
|
||||
# 创建虚拟环境
|
||||
# 注意python 版本需要3.7 - 3.9 高于该版本可能会出现一些依赖包兼容问题
|
||||
python -m venv venv
|
||||
|
||||
# macos & linux 激活虚拟环境
|
||||
source venv/bin/activate
|
||||
|
||||
# windows 激活虚拟环境
|
||||
venv\Scripts\activate
|
||||
|
||||
```
|
||||
|
||||
### 安装依赖库
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 安装 playwright浏览器驱动
|
||||
|
||||
```shell
|
||||
playwright install
|
||||
```
|
||||
|
||||
### 运行爬虫程序
|
||||
|
||||
```shell
|
||||
### 项目默认是没有开启评论爬取模式,如需评论请在config/base_config.py中的 ENABLE_GET_COMMENTS 变量修改
|
||||
### 一些其他支持项,也可以在config/base_config.py查看功能,写的有中文注释
|
||||
|
||||
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
|
||||
python main.py --platform xhs --lt qrcode --type search
|
||||
|
||||
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
|
||||
python main.py --platform xhs --lt qrcode --type detail
|
||||
|
||||
# 打开对应APP扫二维码登录
|
||||
|
||||
# 其他平台爬虫使用示例,执行下面的命令查看
|
||||
python main.py --help
|
||||
```
|
||||
|
||||
### 数据保存
|
||||
- 支持保存到关系型数据库(Mysql、PgSQL等)
|
||||
- 执行 `python db.py` 初始化数据库表结构(只在首次执行)
|
||||
- 支持保存到csv中(data/目录下)
|
||||
- 支持保存到json中(data/目录下)
|
||||
|
||||
|
||||
## 开发者服务
|
||||
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务)
|
||||
<p>
|
||||
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
|
||||
</p>
|
||||
|
||||
星球精选文章:
|
||||
- [【独创】使用Playwright获取某音a_bogus参数流程(包含加密参数分析)](https://articles.zsxq.com/id_u89al50jk9x0.html)
|
||||
- [【独创】使用Playwright低成本获取某书X-s参数流程分析(当年的回忆录)](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
|
||||
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
|
||||
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
|
||||
|
||||
|
||||
|
||||
- MediaCrawler视频课程:
|
||||
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
|
||||
> 课程售价非常非常的便宜,几杯咖啡的事儿.<br>
|
||||
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
|
||||
|
||||
|
||||
|
||||
## 感谢下列Sponsors对本仓库赞助
|
||||
- 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持!
|
||||
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
|
||||
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
|
||||
</a>
|
||||
<br>
|
||||
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这个款免费的GPT助手,帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
|
||||
|
||||
成为赞助者,展示你的产品在这里,联系作者:relakkes@gmail.com
|
||||
|
||||
|
||||
## MediaCrawler爬虫项目交流群:
|
||||
> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
|
||||
>
|
||||
> 如果图片展示不出来,可以直接添加我的微信号:yzglan
|
||||
|
||||
<div style="max-width: 200px">
|
||||
<p><img alt="relakkes_wechat" src="static/images/relakkes_weichat.JPG" style="width: 200px;height: 100%" ></p>
|
||||
</div>
|
||||
|
||||
|
||||
## 运行报错常见问题Q&A
|
||||
> 遇到问题先自行搜索解决下,现在AI很火,用ChatGPT大多情况下能解决你的问题 [免费的ChatGPT](https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk)
|
||||
|
||||
➡️➡️➡️ [常见问题](docs/常见问题.md)
|
||||
|
||||
dy和xhs使用Playwright登录现在会出现滑块验证 + 短信验证,手动过一下
|
||||
|
||||
## 项目代码结构
|
||||
➡️➡️➡️ [项目代码结构说明](docs/项目代码结构.md)
|
||||
|
||||
## 代理IP使用说明
|
||||
➡️➡️➡️ [代理IP使用说明](docs/代理使用.md)
|
||||
|
||||
## 词云图相关操作说明
|
||||
➡️➡️➡️ [词云图相关说明](docs/关于词云图相关操作.md)
|
||||
|
||||
## 手机号登录说明
|
||||
➡️➡️➡️ [手机号登录说明](docs/手机号登录说明.md)
|
||||
|
||||
|
||||
## 打赏
|
||||
免费开源不易,如果项目帮到你了,可以给我打赏哦,您的支持就是我最大的动力!
|
||||
<div style="display: flex;justify-content: space-between;width: 100%">
|
||||
<p><img alt="打赏-微信" src="static/images/wechat_pay.jpeg" style="width: 200px;height: 100%" ></p>
|
||||
<p><img alt="打赏-支付宝" src="static/images/zfb_pay.png" style="width: 200px;height: 100%" ></p>
|
||||
</div>
|
||||
|
||||
## 爬虫入门课程
|
||||
我新开的爬虫教程Github仓库 [CrawlerTutorial](https://github.com/NanmiCoder/CrawlerTutorial) ,感兴趣的朋友可以关注一下,持续更新,主打一个免费.
|
||||
|
||||
|
||||
## 项目贡献者
|
||||
> 感谢你们的贡献,让项目变得更好!(贡献比较多的可以加我wx,免费拉你进我的知识星球,后期还有一些其他福利。)
|
||||
<!-- readme: contributors -start -->
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://github.com/NanmiCoder">
|
||||
<img src="https://avatars.githubusercontent.com/u/47178017?v=4" width="100;" alt="NanmiCoder"/>
|
||||
<br />
|
||||
<sub><b>程序员阿江-Relakkes</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/leantli">
|
||||
<img src="https://avatars.githubusercontent.com/u/117699758?v=4" width="100;" alt="leantli"/>
|
||||
<br />
|
||||
<sub><b>leantli</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Rosyrain">
|
||||
<img src="https://avatars.githubusercontent.com/u/116946548?v=4" width="100;" alt="Rosyrain"/>
|
||||
<br />
|
||||
<sub><b>Rosyrain</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/BaoZhuhan">
|
||||
<img src="https://avatars.githubusercontent.com/u/140676370?v=4" width="100;" alt="BaoZhuhan"/>
|
||||
<br />
|
||||
<sub><b>Bao Zhuhan</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/nelzomal">
|
||||
<img src="https://avatars.githubusercontent.com/u/8512926?v=4" width="100;" alt="nelzomal"/>
|
||||
<br />
|
||||
<sub><b>zhounan</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Hiro-Lin">
|
||||
<img src="https://avatars.githubusercontent.com/u/40111864?v=4" width="100;" alt="Hiro-Lin"/>
|
||||
<br />
|
||||
<sub><b>HIRO</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://github.com/PeanutSplash">
|
||||
<img src="https://avatars.githubusercontent.com/u/98582625?v=4" width="100;" alt="PeanutSplash"/>
|
||||
<br />
|
||||
<sub><b>PeanutSplash</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Ermeng98">
|
||||
<img src="https://avatars.githubusercontent.com/u/55784769?v=4" width="100;" alt="Ermeng98"/>
|
||||
<br />
|
||||
<sub><b>Ermeng</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/henryhyn">
|
||||
<img src="https://avatars.githubusercontent.com/u/5162443?v=4" width="100;" alt="henryhyn"/>
|
||||
<br />
|
||||
<sub><b>Henry He</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Akiqqqqqqq">
|
||||
<img src="https://avatars.githubusercontent.com/u/51102894?v=4" width="100;" alt="Akiqqqqqqq"/>
|
||||
<br />
|
||||
<sub><b>leonardoqiuyu</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/jayeeliu">
|
||||
<img src="https://avatars.githubusercontent.com/u/77389?v=4" width="100;" alt="jayeeliu"/>
|
||||
<br />
|
||||
<sub><b>jayeeliu</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/ZuWard">
|
||||
<img src="https://avatars.githubusercontent.com/u/38209256?v=4" width="100;" alt="ZuWard"/>
|
||||
<br />
|
||||
<sub><b>ZuWard</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Zzendrix">
|
||||
<img src="https://avatars.githubusercontent.com/u/154900254?v=4" width="100;" alt="Zzendrix"/>
|
||||
<br />
|
||||
<sub><b>Zendrix</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/chunpat">
|
||||
<img src="https://avatars.githubusercontent.com/u/19848304?v=4" width="100;" alt="chunpat"/>
|
||||
<br />
|
||||
<sub><b>zhangzhenpeng</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/tanpenggood">
|
||||
<img src="https://avatars.githubusercontent.com/u/37927946?v=4" width="100;" alt="tanpenggood"/>
|
||||
<br />
|
||||
<sub><b>Sam Tan</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/xbsheng">
|
||||
<img src="https://avatars.githubusercontent.com/u/56357338?v=4" width="100;" alt="xbsheng"/>
|
||||
<br />
|
||||
<sub><b>xbsheng</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/yangrq1018">
|
||||
<img src="https://avatars.githubusercontent.com/u/25074163?v=4" width="100;" alt="yangrq1018"/>
|
||||
<br />
|
||||
<sub><b>Martin</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/zhihuiio">
|
||||
<img src="https://avatars.githubusercontent.com/u/165655688?v=4" width="100;" alt="zhihuiio"/>
|
||||
<br />
|
||||
<sub><b>zhihuiio</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://github.com/renaissancezyc">
|
||||
<img src="https://avatars.githubusercontent.com/u/118403818?v=4" width="100;" alt="renaissancezyc"/>
|
||||
<br />
|
||||
<sub><b>Ren</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Tianci-King">
|
||||
<img src="https://avatars.githubusercontent.com/u/109196852?v=4" width="100;" alt="Tianci-King"/>
|
||||
<br />
|
||||
<sub><b>Wang Tianci</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Styunlen">
|
||||
<img src="https://avatars.githubusercontent.com/u/30810222?v=4" width="100;" alt="Styunlen"/>
|
||||
<br />
|
||||
<sub><b>Styunlen</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Schofi">
|
||||
<img src="https://avatars.githubusercontent.com/u/33537727?v=4" width="100;" alt="Schofi"/>
|
||||
<br />
|
||||
<sub><b>Schofi</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/Klu5ure">
|
||||
<img src="https://avatars.githubusercontent.com/u/166240879?v=4" width="100;" alt="Klu5ure"/>
|
||||
<br />
|
||||
<sub><b>Klu5ure</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/keeper-jie">
|
||||
<img src="https://avatars.githubusercontent.com/u/33612777?v=4" width="100;" alt="keeper-jie"/>
|
||||
<br />
|
||||
<sub><b>Kermit</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://github.com/kexinoh">
|
||||
<img src="https://avatars.githubusercontent.com/u/91727108?v=4" width="100;" alt="kexinoh"/>
|
||||
<br />
|
||||
<sub><b>KEXNA</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/aa65535">
|
||||
<img src="https://avatars.githubusercontent.com/u/5417786?v=4" width="100;" alt="aa65535"/>
|
||||
<br />
|
||||
<sub><b>Jian Chang</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
<td align="center">
|
||||
<a href="https://github.com/522109452">
|
||||
<img src="https://avatars.githubusercontent.com/u/16929874?v=4" width="100;" alt="522109452"/>
|
||||
<br />
|
||||
<sub><b>tianqing</b></sub>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<!-- readme: contributors -end -->
|
||||
|
||||
## star 趋势图
|
||||
- 如果该项目对你有帮助,star一下 ❤️❤️❤️
|
||||
|
||||
[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
|
||||
|
||||
|
||||
|
||||
|
||||
## 参考
|
||||
|
||||
- xhs客户端 [ReaJason的xhs仓库](https://github.com/ReaJason/xhs)
|
||||
- 短信转发 [参考仓库](https://github.com/pppscn/SmsForwarder)
|
||||
- 内网穿透工具 [ngrok](https://ngrok.com/docs/)
|
||||
|
||||
|
||||
|
||||
## 免责声明
|
||||
<div id="disclaimer">
|
||||
|
||||
### 1. 项目目的与性质
|
||||
本项目(以下简称“本项目”)是作为一个技术研究与学习工具而创建的,旨在探索和学习网络数据采集技术。本项目专注于自媒体平台的数据爬取技术研究,旨在提供给学习者和研究者作为技术交流之用。
|
||||
|
||||
### 2. 法律合规性声明
|
||||
本项目开发者(以下简称“开发者”)郑重提醒用户在下载、安装和使用本项目时,严格遵守中华人民共和国相关法律法规,包括但不限于《中华人民共和国网络安全法》、《中华人民共和国反间谍法》等所有适用的国家法律和政策。用户应自行承担一切因使用本项目而可能引起的法律责任。
|
||||
|
||||
### 3. 使用目的限制
|
||||
本项目严禁用于任何非法目的或非学习、非研究的商业行为。本项目不得用于任何形式的非法侵入他人计算机系统,不得用于任何侵犯他人知识产权或其他合法权益的行为。用户应保证其使用本项目的目的纯属个人学习和技术研究,不得用于任何形式的非法活动。
|
||||
|
||||
### 4. 免责声明
|
||||
开发者已尽最大努力确保本项目的正当性及安全性,但不对用户使用本项目可能引起的任何形式的直接或间接损失承担责任。包括但不限于由于使用本项目而导致的任何数据丢失、设备损坏、法律诉讼等。
|
||||
|
||||
### 5. 知识产权声明
|
||||
本项目的知识产权归开发者所有。本项目受到著作权法和国际著作权条约以及其他知识产权法律和条约的保护。用户在遵守本声明及相关法律法规的前提下,可以下载和使用本项目。
|
||||
|
||||
### 6. 最终解释权
|
||||
关于本项目的最终解释权归开发者所有。开发者保留随时更改或更新本免责声明的权利,恕不另行通知。
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
4
README.txt
Normal file
4
README.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
小红书核心功能media_platform/xhs/core.py
|
||||
增加了爬推荐的功能:
|
||||
python main.py --platform xhs --lt qrcode --type explore
|
||||
具体函数在core.py中的get_explore函数中
|
96
async_db.py
Normal file
96
async_db.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/6 14:21
|
||||
# @Desc : 异步Aiomysql的增删改查封装
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import aiomysql
|
||||
|
||||
|
||||
class AsyncMysqlDB:
    """Thin async CRUD wrapper around an aiomysql connection pool."""

    def __init__(self, pool: aiomysql.Pool) -> None:
        # The pool is owned by the caller; this class only borrows connections.
        self.__pool = pool

    async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
        """
        Run a SELECT and return all matching rows.

        :param sql: SQL statement with %s placeholders
        :param args: values bound to the placeholders
        :return: list of row dicts (empty list when nothing matched)
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchall()
                return data or []

    async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
        """
        Run a SELECT and return only the first matching row.

        :param sql: SQL statement with %s placeholders
        :param args: values bound to the placeholders
        :return: first row dict, or None when nothing matched
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchone()
                return data

    async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
        """
        Insert one record (column -> value mapping) into *table_name*.

        :param table_name: target table
        :param item: one record as a dict
        :return: auto-increment id of the inserted row
        """
        fields = list(item.keys())
        values = list(item.values())
        # Backquote column names so reserved words can be used as columns.
        fields = [f'`{field}`' for field in fields]
        fieldstr = ','.join(fields)
        valstr = ','.join(['%s'] * len(item))
        # Table/column identifiers cannot be bound as parameters, so they are
        # interpolated here; all values go through driver-side binding.
        sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, values)
                lastrowid = cur.lastrowid
                return lastrowid

    async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
                           value_where: Union[str, int, float]) -> int:
        """
        Update rows of *table_name* where *field_where* equals *value_where*.

        :param table_name: target table
        :param updates: column -> new value mapping
        :param field_where: column name used in the WHERE clause
        :param value_where: value compared against *field_where*
        :return: number of affected rows
        """
        upsets = []
        values = []
        for k, v in updates.items():
            s = '`%s`=%%s' % k
            upsets.append(s)
            values.append(v)
        upsets = ','.join(upsets)
        # BUGFIX: the WHERE value used to be interpolated directly into the SQL
        # string wrapped in double quotes, which broke on values containing
        # quotes and was SQL-injectable. Bind it as a parameter instead.
        sql = 'UPDATE %s SET %s WHERE %s=%%s' % (
            table_name,
            upsets,
            field_where,
        )
        values.append(value_where)
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, values)
                return rows

    async def execute(self, sql: str, *args: Union[str, int]) -> int:
        """
        Execute a write statement (INSERT / UPDATE / DELETE / DDL).

        :param sql: SQL statement with %s placeholders
        :param args: values bound to the placeholders
        :return: number of affected rows
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, args)
                return rows
|
0
base/__init__.py
Normal file
0
base/__init__.py
Normal file
71
base/base_crawler.py
Normal file
71
base/base_crawler.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, BrowserType
|
||||
|
||||
|
||||
class AbstractCrawler(ABC):
    """Interface every platform crawler must implement."""

    @abstractmethod
    async def start(self):
        """Run the crawler's main entry point."""
        ...

    @abstractmethod
    async def search(self):
        """Crawl posts for the configured search keywords."""
        ...

    @abstractmethod
    async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
                             headless: bool = True) -> BrowserContext:
        """Launch a Playwright browser and return the browser context to crawl with."""
        ...
|
||||
|
||||
|
||||
class AbstractLogin(ABC):
    """Interface for a platform login flow (QR code, mobile or saved cookies)."""

    @abstractmethod
    async def begin(self):
        """Start the login flow."""
        ...

    @abstractmethod
    async def login_by_qrcode(self):
        """Log in by scanning a QR code."""
        ...

    @abstractmethod
    async def login_by_mobile(self):
        """Log in with a mobile phone number."""
        ...

    @abstractmethod
    async def login_by_cookies(self):
        """Log in by reusing previously saved cookies."""
        ...
|
||||
|
||||
|
||||
class AbstractStore(ABC):
    """Interface for persisting crawled content, comments and creators."""

    @abstractmethod
    async def store_content(self, content_item: Dict):
        """Persist one content (post) record."""
        ...

    @abstractmethod
    async def store_comment(self, comment_item: Dict):
        """Persist one comment record."""
        ...

    # TODO support all platform
    # Deliberately NOT abstract: only xhs implements creator storage so far,
    # so other platforms inherit this no-op default.
    # @abstractmethod
    async def store_creator(self, creator: Dict):
        """Persist one creator record (optional; default is a no-op)."""
        ...
|
||||
|
||||
|
||||
class AbstractStoreImage(ABC):
    """Interface for persisting downloaded images."""

    # TODO: make this abstract once every platform implements it
    # (currently only weibo does), hence @abstractmethod is left commented.
    # @abstractmethod
    async def store_image(self, image_content_item: Dict):
        """Persist one image record (optional; default is a no-op)."""
        ...
|
||||
|
||||
|
||||
class AbstractApiClient(ABC):
    """Interface for a platform's low-level HTTP API client."""

    @abstractmethod
    async def request(self, method, url, **kwargs):
        """Send one HTTP request; subclasses define response handling."""
        ...

    @abstractmethod
    async def update_cookies(self, browser_context: BrowserContext):
        """Refresh this client's cookies from the given browser context."""
        ...
|
0
cache/__init__.py
vendored
Normal file
0
cache/__init__.py
vendored
Normal file
42
cache/abs_cache.py
vendored
Normal file
42
cache/abs_cache.py
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:06
|
||||
# @Desc : 抽象类
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Optional
|
||||
|
||||
|
||||
class AbstractCache(ABC):
    """Key/value cache interface with per-key expiry."""

    @abstractmethod
    def get(self, key: str) -> Optional[Any]:
        """
        Return the cached value for *key*, or None.

        Abstract: every concrete cache must implement this.

        :param key: cache key
        :return: the cached value, or None
        """
        raise NotImplementedError

    @abstractmethod
    def set(self, key: str, value: Any, expire_time: int) -> None:
        """
        Store *value* under *key*.

        Abstract: every concrete cache must implement this.

        :param key: cache key
        :param value: value to store
        :param expire_time: time-to-live for the entry
        :return:
        """
        raise NotImplementedError

    @abstractmethod
    def keys(self, pattern: str) -> List[str]:
        """
        Return every cached key matching *pattern*.

        :param pattern: match pattern
        :return: list of matching keys
        """
        raise NotImplementedError
|
29
cache/cache_factory.py
vendored
Normal file
29
cache/cache_factory.py
vendored
Normal file
|
@ -0,0 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:23
|
||||
# @Desc :
|
||||
|
||||
|
||||
class CacheFactory:
    """
    Factory that builds a concrete cache backend by name.
    """

    @staticmethod
    def create_cache(cache_type: str, *args, **kwargs):
        """
        Build a cache instance for the requested backend.

        :param cache_type: backend name: 'memory' or 'redis'
        :param args: positional arguments forwarded to the memory backend
        :param kwargs: keyword arguments forwarded to the memory backend
        :return: a cache instance
        :raises ValueError: when *cache_type* is unknown
        """
        # Imports stay inside the branches so a backend's dependency is only
        # required when that backend is actually selected.
        if cache_type == 'memory':
            from .local_cache import ExpiringLocalCache
            return ExpiringLocalCache(*args, **kwargs)
        if cache_type == 'redis':
            # NOTE: RedisCache takes no constructor arguments, so *args/**kwargs
            # are intentionally not forwarded on this branch.
            from .redis_cache import RedisCache
            return RedisCache()
        raise ValueError(f'Unknown cache type: {cache_type}')
|
120
cache/local_cache.py
vendored
Normal file
120
cache/local_cache.py
vendored
Normal file
|
@ -0,0 +1,120 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:05
|
||||
# @Desc : 本地缓存
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from cache.abs_cache import AbstractCache
|
||||
|
||||
|
||||
class ExpiringLocalCache(AbstractCache):
    """Process-local in-memory cache with per-key TTL and a periodic cleanup task."""

    def __init__(self, cron_interval: int = 10):
        """
        Initialise the local cache.

        :param cron_interval: seconds between periodic cleanup sweeps
        :return:
        """
        self._cron_interval = cron_interval
        # key -> (value, absolute expiry timestamp based on time.time())
        self._cache_container: Dict[str, Tuple[Any, float]] = {}
        self._cron_task: Optional[asyncio.Task] = None
        # Start the periodic cleanup task.
        self._schedule_clear()

    def __del__(self):
        """
        Destructor: cancel the cleanup task so it does not outlive the cache.

        :return:
        """
        if self._cron_task is not None:
            self._cron_task.cancel()

    def get(self, key: str) -> Optional[Any]:
        """
        Return the cached value for *key*, or None when missing or expired.

        :param key:
        :return:
        """
        value, expire_time = self._cache_container.get(key, (None, 0))
        if value is None:
            return None

        # Expired entries are also removed lazily on access.
        if expire_time < time.time():
            del self._cache_container[key]
            return None

        return value

    def set(self, key: str, value: Any, expire_time: int) -> None:
        """
        Store *value* under *key* for *expire_time* seconds.

        :param key:
        :param value:
        :param expire_time:
        :return:
        """
        self._cache_container[key] = (value, time.time() + expire_time)

    def keys(self, pattern: str) -> List[str]:
        """
        Return all keys matching *pattern*.

        '*' alone matches every key; otherwise '*' characters are stripped
        and a plain substring match is used.

        :param pattern: match pattern
        :return:
        """
        if pattern == '*':
            return list(self._cache_container.keys())

        # The local backend only supports substring matching, so wildcard
        # characters are simply dropped.
        if '*' in pattern:
            pattern = pattern.replace('*', '')

        return [key for key in self._cache_container.keys() if pattern in key]

    def _schedule_clear(self):
        """
        Obtain (or create) an event loop and schedule the cleanup coroutine.

        :return:
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        self._cron_task = loop.create_task(self._start_clear_cron())

    def _clear(self):
        """
        Remove every expired entry from the cache.

        :return:
        """
        # BUGFIX: the previous implementation deleted keys while iterating
        # self._cache_container.items(), which raises
        # "RuntimeError: dictionary changed size during iteration" as soon as
        # an entry has actually expired. Snapshot the expired keys first.
        now = time.time()
        expired_keys = [
            key for key, (_, expire_time) in self._cache_container.items()
            if expire_time < now
        ]
        for key in expired_keys:
            del self._cache_container[key]

    async def _start_clear_cron(self):
        """
        Run the cleanup sweep forever, once every *cron_interval* seconds.

        :return:
        """
        while True:
            self._clear()
            await asyncio.sleep(self._cron_interval)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc smoke test for ExpiringLocalCache with a 2-second cleanup sweep.
    cache = ExpiringLocalCache(cron_interval=2)
    cache.set('name', '程序员阿江-Relakkes', 3)
    # NOTE(review): the entry was stored under 'name' but is read back as
    # 'key', so both prints below show None — presumably 'name' was intended.
    print(cache.get('key'))
    print(cache.keys("*"))
    # Wait past the 3-second TTL so the entry expires.
    time.sleep(4)
    print(cache.get('key'))
    # Drop the cache so __del__ cancels the cleanup task.
    del cache
    time.sleep(1)
    print("done")
|
76
cache/redis_cache.py
vendored
Normal file
76
cache/redis_cache.py
vendored
Normal file
|
@ -0,0 +1,76 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/5/29 22:57
|
||||
# @Desc : RedisCache实现
|
||||
import pickle
|
||||
import time
|
||||
from typing import Any, List
|
||||
|
||||
from redis import Redis
|
||||
|
||||
from cache.abs_cache import AbstractCache
|
||||
from config import db_config
|
||||
|
||||
|
||||
class RedisCache(AbstractCache):
    """Cache backend backed by a Redis server; values are stored pickled."""

    def __init__(self) -> None:
        # Connect once and reuse the client for every operation.
        self._redis_client = self._connect_redis()

    @staticmethod
    def _connect_redis() -> Redis:
        """
        Build a Redis client from the connection settings in config.db_config.

        :return:
        """
        return Redis(
            host=db_config.REDIS_DB_HOST,
            port=db_config.REDIS_DB_PORT,
            db=db_config.REDIS_DB_NUM,
            password=db_config.REDIS_DB_PWD,
        )

    def get(self, key: str) -> Any:
        """
        Fetch and unpickle the value stored under *key* (None when absent).

        :param key:
        :return:
        """
        raw = self._redis_client.get(key)
        # NOTE: pickle is only safe because this data was written by set()
        # below; never unpickle values from an untrusted source.
        return None if raw is None else pickle.loads(raw)

    def set(self, key: str, value: Any, expire_time: int) -> None:
        """
        Pickle *value* and store it under *key* with an expiry of
        *expire_time* seconds.

        :param key:
        :param value:
        :param expire_time:
        :return:
        """
        self._redis_client.set(key, pickle.dumps(value), ex=expire_time)

    def keys(self, pattern: str) -> List[str]:
        """
        Return every key matching *pattern*, decoded from bytes to str.
        """
        return [raw_key.decode() for raw_key in self._redis_client.keys(pattern)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test against a running redis instance.
    client = RedisCache()

    # Round-trip a plain string with a 1-second TTL.
    client.set("name", "程序员阿江-Relakkes", 1)
    print(client.get("name"))  # prints the stored string
    print(client.keys("*"))  # ['name']
    time.sleep(2)
    print(client.get("name"))  # None once the TTL has elapsed

    # Non-string python objects survive the pickle round-trip too.
    client.set("list", [1, 2, 3], 10)
    fetched = client.get("list")
    print(fetched, f"value type:{type(fetched)}")  # [1, 2, 3]
|
1
cmd_arg/__init__.py
Normal file
1
cmd_arg/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
from .arg import *
|
40
cmd_arg/arg.py
Normal file
40
cmd_arg/arg.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
import argparse
|
||||
|
||||
import config
|
||||
from tools.utils import str2bool
|
||||
|
||||
|
||||
async def parse_cmd():
    """
    Parse command-line arguments and write them back into the config module.

    Every flag defaults to the current value in config, so all flags are
    optional and the CLI only overrides what the user supplies.
    :return: None (results are stored as config attributes)
    """
    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator", "explore"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='number of start page', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords', default=config.KEYWORDS)
    parser.add_argument('--get_comment', type=str2bool,
                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
    # Bug fix: the original help string began with a stray apostrophe
    # (''''whether ...) that rendered as 'whether in --help output.
    parser.add_argument('--get_sub_comment', type=str2bool,
                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
    parser.add_argument('--save_data_option', type=str,
                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
    parser.add_argument('--cookies', type=str,
                        help='cookies used for cookie login type', default=config.COOKIES)

    args = parser.parse_args()

    # Override config so the rest of the program keeps reading its
    # settings from the config module as usual.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
    config.ENABLE_GET_COMMENTS = args.get_comment
    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
    config.SAVE_DATA_OPTION = args.save_data_option
    config.COOKIES = args.cookies
|
2
config/__init__.py
Normal file
2
config/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
from .base_config import *
|
||||
from .db_config import *
|
131
config/base_config.py
Normal file
131
config/base_config.py
Normal file
|
@ -0,0 +1,131 @@
|
|||
# Base configuration
PLATFORM = "xhs"
KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
COOKIES = ""
# See the enum values under media_platform.xxx.field; currently only supported for xhs
SORT_TYPE = "popularity_descending"
# See the enum values under media_platform.xxx.field; currently only supported for douyin
PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = "search"  # crawl type: search (keyword search) | detail (post detail) | creator (creator homepage data)

# Whether to enable IP proxying
ENABLE_IP_PROXY = False

# Size of the proxy IP pool
IP_PROXY_POOL_COUNT = 2

# Proxy IP provider name
IP_PROXY_PROVIDER_NAME = "kuaidaili"

# True runs the browser headless (no visible window)
# False opens a visible browser window
# xhs: if QR-code login keeps failing, open the browser and pass the slider captcha manually
# douyin: if login keeps failing, open the browser and check whether phone verification
# appeared after scanning; complete it manually and retry
HEADLESS = False

# Whether to persist the login state
SAVE_LOGIN_STATE = True

# Where to save crawled data; three options are supported: csv, db, json
SAVE_DATA_OPTION = "json"  # csv or db or json

# Browser user-data directory used for the cached login state
USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

# Page to start crawling from; defaults to the first page
START_PAGE = 1

# Cap on the number of videos/posts to crawl
CRAWLER_MAX_NOTES_COUNT = 20

# Maximum number of concurrent crawler tasks
MAX_CONCURRENCY_NUM = 4

# Whether to crawl images; disabled by default
ENABLE_GET_IMAGES = False

# Whether to crawl comments; disabled by default
ENABLE_GET_COMMENTS = False

# Whether to crawl second-level (reply) comments; disabled by default.
# Currently only xhs and bilibili are supported.
# Projects with a db from an older version must add the table column per schema/tables.sql line 287
ENABLE_GET_SUB_COMMENTS = False

# xhs note IDs to crawl
# 667a0c27000000001e010d42
XHS_SPECIFIED_ID_LIST = [
    "6422c2750000000027000d88",
    "64ca1b73000000000b028dd2",
    "630d5b85000000001203ab41",
    # ........................
]

# Douyin IDs to crawl
DY_SPECIFIED_ID_LIST = [
    "7280854932641664319",
    "7202432992642387233"
    # ........................
]

# Kuaishou IDs to crawl
KS_SPECIFIED_ID_LIST = [
    "3xf8enb8dbj6uig",
    "3x6zz972bchmvqe"
]

# Bilibili video bvids to crawl
BILI_SPECIFIED_ID_LIST = [
    "BV1d54y1g7db",
    "BV1Sz4y1U77N",
    "BV14Q4y1n7jz",
    # ........................
]

# Weibo post IDs to crawl
WEIBO_SPECIFIED_ID_LIST = [
    "4982041758140155",
    # ........................
]

# xhs creator IDs to crawl
XHS_CREATOR_ID_LIST = [
    "5c4548d80000000006030727",
    # "63e36c9a000000002703502b",
    # ........................
]

# Douyin creator IDs to crawl (sec_id)
DY_CREATOR_ID_LIST = [
    "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
    # ........................
]

# Bilibili creator IDs to crawl (sec_id)
BILI_CREATOR_ID_LIST = [
    "20813884",
    # ........................
]

# Kuaishou creator IDs to crawl
KS_CREATOR_ID_LIST = [
    "3x4sm73aye7jq7i",
    # ........................
]


# Word-cloud settings
# Whether to generate a word cloud from crawled comments
ENABLE_GET_WORDCLOUD = False
# Custom words and their groups
# Rule: xx:yy where xx is the custom word/phrase and yy is the group it is assigned to
CUSTOM_WORDS = {
    '零几': '年份',  # treat this phrase as a single token
    '高频词': '专业术语'  # example custom word
}

# Path to the stop-words file
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"

# Path to a Chinese font file (used when rendering the word cloud)
FONT_PATH= "./docs/STZHONGS.TTF"
|
20
config/db_config.py
Normal file
20
config/db_config.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
import os

# mysql config
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", "3306")
RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")

RELATION_DB_URL = f"mysql://{RELATION_DB_USER}:{RELATION_DB_PWD}@{RELATION_DB_HOST}:{RELATION_DB_PORT}/{RELATION_DB_NAME}"

# redis config
REDIS_DB_HOST = "127.0.0.1"  # your redis host
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456")  # your redis password
# Bug fix: os.getenv returns a str when the variable is set but the int
# default otherwise, so the type used to depend on the environment.
# Cast so port and db number are always ints.
REDIS_DB_PORT = int(os.getenv("REDIS_DB_PORT", 6379))  # your redis port
REDIS_DB_NUM = int(os.getenv("REDIS_DB_NUM", 0))  # your redis db num

# cache type
CACHE_TYPE_REDIS = "redis"
CACHE_TYPE_MEMORY = "memory"
|
96
db.py
Normal file
96
db.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/6 14:54
|
||||
# @Desc : mediacrawler db 管理
|
||||
import asyncio
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiofiles
|
||||
import aiomysql
|
||||
|
||||
import config
|
||||
from async_db import AsyncMysqlDB
|
||||
from tools import utils
|
||||
from var import db_conn_pool_var, media_crawler_db_var
|
||||
|
||||
|
||||
def parse_mysql_url(mysql_url) -> Dict:
    """
    Split a mysql connection URL into the keyword arguments aiomysql expects,
    since aiomysql cannot consume a URL directly.

    Args:
        mysql_url: e.g. mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler

    Returns:
        dict with 'host', 'port', 'user', 'password' and 'db' entries
    """
    parts = urlparse(mysql_url)
    # Fall back to the default mysql port when the URL omits one.
    return {
        'host': parts.hostname,
        'port': parts.port or 3306,
        'user': parts.username,
        'password': parts.password,
        'db': parts.path.lstrip('/'),
    }
|
||||
|
||||
|
||||
async def init_mediacrawler_db():
    """
    Create the mysql connection pool and publish it, together with the
    AsyncMysqlDB CRUD wrapper, through the db_conn_pool_var and
    media_crawler_db_var context variables.

    Returns:

    """
    conn_kwargs = parse_mysql_url(config.RELATION_DB_URL)
    conn_pool = await aiomysql.create_pool(autocommit=True, **conn_kwargs)

    # Expose both the raw pool and the CRUD wrapper via context variables
    # so any coroutine in the program can reach them.
    db_conn_pool_var.set(conn_pool)
    media_crawler_db_var.set(AsyncMysqlDB(conn_pool))
|
||||
|
||||
|
||||
async def init_db():
    """
    Initialize the db connection pool (thin logging wrapper around
    init_mediacrawler_db).

    Returns:

    """
    utils.logger.info("[init_db] start init mediacrawler db connect object")
    await init_mediacrawler_db()
    utils.logger.info("[init_db] end init mediacrawler db connect object")
|
||||
|
||||
|
||||
async def close():
    """
    Close the mysql connection pool, waiting until every connection has
    actually been released.

    Returns:

    """
    utils.logger.info("[close] close mediacrawler db pool")
    db_pool: aiomysql.Pool = db_conn_pool_var.get()
    if db_pool is not None:
        db_pool.close()
        # Fix: pool.close() only marks the pool as closing; per aiomysql
        # docs wait_closed() must be awaited for a clean shutdown.
        await db_pool.wait_closed()
|
||||
|
||||
|
||||
async def init_table_schema():
    """
    Initialize the database table schema. Use only when the tables need to
    be created for the first time: running it again deletes the existing
    tables and all of their data (per schema/tables.sql).

    Returns:

    """
    utils.logger.info("[init_table_schema] begin init mysql table schema ...")
    await init_mediacrawler_db()
    async_db_obj: AsyncMysqlDB = media_crawler_db_var.get()
    # Read the full DDL script and hand it to the db wrapper in one call.
    # NOTE(review): assumes the driver accepts a multi-statement script in
    # a single execute — confirm the connection settings allow this.
    async with aiofiles.open("schema/tables.sql", mode="r") as f:
        schema_sql = await f.read()
    await async_db_obj.execute(schema_sql)
    utils.logger.info("[init_table_schema] mediacrawler table schema init successful")
    await close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # asyncio.run creates and tears down a fresh event loop; the old
    # get_event_loop().run_until_complete pattern is deprecated (3.10+).
    asyncio.run(init_table_schema())
|
BIN
docs/STZHONGS.TTF
Normal file
BIN
docs/STZHONGS.TTF
Normal file
Binary file not shown.
768
docs/hit_stopwords.txt
Normal file
768
docs/hit_stopwords.txt
Normal file
|
@ -0,0 +1,768 @@
|
|||
\n
|
||||
———
|
||||
》),
|
||||
)÷(1-
|
||||
”,
|
||||
)、
|
||||
=(
|
||||
:
|
||||
→
|
||||
℃
|
||||
&
|
||||
*
|
||||
一一
|
||||
~~~~
|
||||
’
|
||||
.
|
||||
『
|
||||
.一
|
||||
./
|
||||
--
|
||||
』
|
||||
=″
|
||||
【
|
||||
[*]
|
||||
}>
|
||||
[⑤]]
|
||||
[①D]
|
||||
c]
|
||||
ng昉
|
||||
*
|
||||
//
|
||||
[
|
||||
]
|
||||
[②e]
|
||||
[②g]
|
||||
={
|
||||
}
|
||||
,也
|
||||
‘
|
||||
A
|
||||
[①⑥]
|
||||
[②B]
|
||||
[①a]
|
||||
[④a]
|
||||
[①③]
|
||||
[③h]
|
||||
③]
|
||||
1.
|
||||
--
|
||||
[②b]
|
||||
’‘
|
||||
×××
|
||||
[①⑧]
|
||||
0:2
|
||||
=[
|
||||
[⑤b]
|
||||
[②c]
|
||||
[④b]
|
||||
[②③]
|
||||
[③a]
|
||||
[④c]
|
||||
[①⑤]
|
||||
[①⑦]
|
||||
[①g]
|
||||
∈[
|
||||
[①⑨]
|
||||
[①④]
|
||||
[①c]
|
||||
[②f]
|
||||
[②⑧]
|
||||
[②①]
|
||||
[①C]
|
||||
[③c]
|
||||
[③g]
|
||||
[②⑤]
|
||||
[②②]
|
||||
一.
|
||||
[①h]
|
||||