再也不用花钱买漫画!Python爬取某漫画的脚本及源码

一、工具

python3
第三方类库requests
python3-pyqt5(GUI依赖,不用GUI可不装)

ubuntu系列系统使用以下命令安装依赖:

URL格式: 漫画首页的URL,如http://m.ac.qq.com/Comic/view/id/518333(移动版) 或 http://ac.qq.com/Comic/comicInfo/id/17114, http://ac.qq.com/naruto(PC版)

注意: 火影忍者彩漫需要访问m.ac.qq.com搜索火影忍者,因为PC端页面火影忍者彩漫和黑白漫画是一个id一个url。

二、命令行帮助

usage: getComic.py [-h] [-u URL] [-p PATH] [-d] [-l LIST]
 
*下载腾讯漫画,仅供学习交流,请勿用于非法用途*
空参运行进入交互式模式运行。
 
optional arguments:
  -h, --help            show this help message and exit
  -u URL, --url URL     要下载的漫画的首页,可以下载以下类型的url: 
                        http://ac.qq.com/Comic/comicInfo/id/511915
                        http://m.ac.qq.com/Comic/comicInfo/id/505430
                        http://pad.ac.qq.com/Comic/comicInfo/id/505430
                        http://ac.qq.com/naruto
  -p PATH, --path PATH  漫画下载路径。 默认: /home/fengyu/tencent_comic
  -d, --dir             将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)
  -l LIST, --list LIST  要下载的漫画章节列表,不指定则下载所有章节。格式范例: 
                        N - 下载具体某一章节,如-l 1, 下载第1章
                        N,N... - 下载某几个不连续的章节,如 \"-l 1,3,5\", 下载1,3,5章
                        N-N... - 下载某一段连续的章节,如 \"-l 10-50\", 下载[10,50]章
                        杂合型 - 结合上面所有的规则,如 \"-l 1,3,5-7,11-111\"

三、GUI预览效果

支持不连续的章节选择下载

windows预览效果:

再也不用花钱买漫画!Python爬取某漫画的脚本及源码再也不用花钱买漫画!Python爬取某漫画的脚本及源码

deepin/Linux 预览效果:

再也不用花钱买漫画!Python爬取某漫画的脚本及源码

四、全部源码

import requests
import re
import json
import os
import argparse
 
requestSession = requests.session()
UA = \'Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X; en-us) \\
        AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 \\
        Mobile/9B176 Safari/7534.48.3\' # ipad UA
requestSession.headers.update({\'User-Agent\': UA})
 
class ErrorCode(Exception):
    \'\'\'自定义错误码:
        1: URL不正确
        2: URL无法跳转为移动端URL
        3: 中断下载\'\'\'
    def __init__(self, code):
        self.code = code
 
    def __str__(self):
        return repr(self.code)
 
def isLegelUrl(url):
    legal_url_list = [
        re.compile(r\'^http://ac.qq.com/Comic/[Cc]omicInfo/id/\\d+/?$\'),
        re.compile(r\'^http://m.ac.qq.com/Comic/[Cc]omicInfo/id/\\d+/?$\'),
        re.compile(r\'^http://ac.qq.com/\\w+/?$\'),
        re.compile(r\'^http://pad.ac.qq.com/Comic/[Cc]omicInfo/id/\\d+/?$\')
    ]
 
    for legal_url in legal_url_list:
        if legal_url.match(url):
            return True
    return False
 
def getId(url):
    if not isLegelUrl(url):
        print(\'请输入正确的url!具体支持的url请在命令行输入-h|--help参数查看帮助文档。\')
        raise ErrorCode(1)
 
    numRE = re.compile(r\'\\d+$\')
    
    id = numRE.findall(url)
    if not id:
        get_id_request = requestSession.get(url)
        url = get_id_request.url
        id = numRE.findall(url)
        if not isLegelUrl(url) or not id:
            print(\'无法自动跳转移动端URL,请进入http://m.ac.qq.com,找到\'
            \'该漫画地址。\\n\'
            \'地址应该像这样: \'
            \'http://m.ac.qq.com/Comic/comicInfo/id/xxxxx (xxxxx为整数)\')
            raise ErrorCode(2)
 
    return id[0]
 
def getContent(id):
    getComicInfoUrl = \'http://pad.ac.qq.com/GetData/getComicInfo?id={}\'.format(id)
    requestSession.headers.update({\'Cookie\': \'ac_refer=http://pad.ac.qq.com\'})
    requestSession.headers.update({\'Referer\': \'http://pad.ac.qq.com\'})
    getComicInfo = requestSession.get(getComicInfoUrl)
    comicInfoJson = getComicInfo.text
    comicInfo = json.loads(comicInfoJson)
    comicName = comicInfo[\'title\']
    comicIntrd = comicInfo[\'brief_intrd\']
    getChapterListUrl = \'http://pad.ac.qq.com/GetData/getChapterList?id={}\'.format(id)
    getChapterList = requestSession.get(getChapterListUrl)
    contentJson = json.loads(getChapterList.text)
    count = contentJson[\'length\']
    sortedContentList = []
    for i in range(count + 1):
        for item in contentJson:
            if isinstance(contentJson[item], dict) and contentJson[item].get(\'seq\') == i:
                sortedContentList.append({item: contentJson[item]})
                break
    return (comicName, comicIntrd, count, sortedContentList)
 
def getImgList(contentJson, id):
    cid = list(contentJson.keys())[0]
    getPicHashURL = \'http://pad.ac.qq.com/View/mGetPicHash?id={}&cid={}\'.format(id, cid)
    picJsonPage = requestSession.get(getPicHashURL).text
    picJson = json.loads(picJsonPage)
    count = picJson[\'pCount\']    #统计图片数量
    pHash = picJson[\'pHash\']
    sortedImgDictList = []
    for i in range(1, count + 1):
        for item in pHash:
            if pHash[item][\'seq\'] == i:
                sortedImgDictList.append(pHash[item])
                break
    imgList = []
    for imgDict in sortedImgDictList:
        k = imgDict[\'cid\']
        m = imgDict[\'pid\']
        j = int(id)
        uin = max(j + k + m, 10001)
        l = [j % 1000 // 100, j % 100, j, k]
        n = \'/mif800/\' + \'/\'.join(str(j) for j in l) + \'/\'
        h = str(m) + \'.mif2\'
        g=\"http://ac.tc.qq.com/store_file_download?buid=15017&uin=\"+str(uin)+\"&dir_path=\"+n+\"&name=\"+h
        imgList.append(g)
    return imgList
 
def downloadImg(imgUrlList, contentPath, one_folder=False):
    count = len(imgUrlList)
    print(\'该集漫画共计{}张图片\'.format(count))
    i = 1
 
    for imgUrl in imgUrlList:
        print(\'\\r正在下载第{}张图片...\'.format(i), end = \'\')
        if not one_folder:
            imgPath = os.path.join(contentPath, \'{0:0>3}.jpg\'.format(i))
        else:
            imgPath = contentPath + \'{0:0>3}.jpg\'.format(i)
        i += 1
        
        #目标文件存在就跳过下载
        if os.path.isfile(imgPath):
            continue
 
        try:
            downloadRequest = requestSession.get(imgUrl, stream=True)
            with open(imgPath, \'wb\') as f:
                for chunk in downloadRequest.iter_content(chunk_size=1024): 
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)
                        f.flush()
        except (KeyboardInterrupt, SystemExit):
            print(\'\\n\\n中断下载,删除未下载完的文件!\')
            if os.path.isfile(imgPath):
                os.remove(imgPath)
            raise ErrorCode(3)
 
    print(\'完毕!\\n\')
 
def parseLIST(lst):
    \'\'\'解析命令行中的-l|--list参数,返回解析后的章节列表\'\'\'
    legalListRE = re.compile(r\'^\\d+([,-]\\d+)*$\')
    if not legalListRE.match(lst):
        raise LISTFormatError(lst + \' 不匹配正则: \' + r\'^\\d+([,-]\\d+)*$\')
 
    #先逗号分割字符串,分割后的字符串再用短横杠分割
    parsedLIST = []
    sublist = lst.split(\',\')
    numRE = re.compile(r\'^\\d+$\')
 
    for sub in sublist:
        if numRE.match(sub):
            if int(sub) > 0: #自动忽略掉数字0
                parsedLIST.append(int(sub))
            else:
                print(\'警告: 参数中包括不存在的章节0,自动忽略\')
        else:
            splitnum = list(map(int, sub.split(\'-\')))
            maxnum = max(splitnum)
            minnum = min(splitnum)       #min-max或max-min都支持
            if minnum == 0:
                minnum = 1               #忽略数字0
                print(\'警告: 参数中包括不存在的章节0,自动忽略\')
            parsedLIST.extend(range(minnum, maxnum+1))
 
    parsedLIST = sorted(set(parsedLIST)) #按照从小到大的顺序排序并去重
    return parsedLIST
 
def main(url, path, lst=None, one_folder=False):
    \'\'\'url: 要爬取的漫画首页。 path: 漫画下载路径。 lst: 要下载的章节列表(-l|--list后面的参数)\'\'\'
    try:
        if not os.path.isdir(path):
           os.makedirs(path)
        id = getId(url)
        comicName,comicIntrd,count,contentList = getContent(id)
        contentNameList = []
        for item in contentList:
            for k in item:
                contentNameList.append(item[k][\'t\'])
        print(\'漫画名: {}\'.format(comicName))
        print(\'简介: {}\'.format(comicIntrd))
        print(\'章节数: {}\'.format(count))
        print(\'章节列表:\')
        try:
            print(\'\\n\'.join(contentNameList))
        except Exception:
            print(\'章节列表包含无法解析的特殊字符\\n\')
            
        forbiddenRE = re.compile(r\'[\\\\/\":*?<>|]\') #windows下文件名非法字符\\ / : * ? \" < > |
        comicName = re.sub(forbiddenRE, \'_\', comicName) #将windows下的非法字符一律替换为_
        comicPath = os.path.join(path, comicName)
        if not os.path.isdir(comicPath):
            os.makedirs(comicPath)
        print()
        
        if not lst:
            contentRange = range(1, len(contentList) + 1)
        else:
            contentRange = parseLIST(lst)
 
        for i in contentRange:
            if i > len(contentList):
                print(\'警告: 章节总数 {} ,\'
                        \'参数中包含过大数值,\'
                        \'自动忽略\'.format(len(contentList)))
                break
 
            contentNameList[i - 1] = re.sub(forbiddenRE, \'_\', contentNameList[i - 1]) #将windows下的非法字符一律替换为_
            contentPath = os.path.join(comicPath, \'第{0:0>4}话-{1}\'.format(i, contentNameList[i - 1]))
 
            try:
                print(\'正在下载第{0:0>4}话: {1}\'.format(i, contentNameList[i -1]))
            except Exception:
                print(\'正在下载第{0:0>4}话: {1}\'.format(i))
 
            if not one_folder:
                if not os.path.isdir(contentPath):
                    os.mkdir(contentPath)
 
            imgList = getImgList(contentList[i - 1], id)
            downloadImg(imgList, contentPath, one_folder)
 
    except ErrorCode as e:
        exit(e.code)
    
if __name__ == \'__main__\':
    defaultPath = os.path.join(os.path.expanduser(\'~\'), \'tencent_comic\')
 
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                     description=\'*下载腾讯漫画,仅供学习交流,请勿用于非法用途*\\n\'
                                     \'空参运行进入交互式模式运行。\')
    parser.add_argument(\'-u\', \'--url\', help=\'要下载的漫画的首页,可以下载以下类型的url: \\n\'
            \'http://ac.qq.com/Comic/comicInfo/id/511915\\n\'
            \'http://m.ac.qq.com/Comic/comicInfo/id/505430\\n\'
            \'http://pad.ac.qq.com/Comic/comicInfo/id/505430\\n\'
            \'http://ac.qq.com/naruto\')
    parser.add_argument(\'-p\', \'--path\', help=\'漫画下载路径。 默认: {}\'.format(defaultPath), 
                default=defaultPath)
    parser.add_argument(\'-d\', \'--dir\', action=\'store_true\', help=\'将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)\')
    parser.add_argument(\'-l\', \'--list\', help=(\"要下载的漫画章节列表,不指定则下载所有章节。格式范例: \\n\"
                                              \"N - 下载具体某一章节,如-l 1, 下载第1章\\n\"
                                              \'N,N... - 下载某几个不连续的章节,如 \"-l 1,3,5\", 下载1,3,5章\\n\'
                                              \'N-N... - 下载某一段连续的章节,如 \"-l 10-50\", 下载[10,50]章\\n\'
                                              \'杂合型 - 结合上面所有的规则,如 \"-l 1,3,5-7,11-111\"\'))
    args = parser.parse_args()
    url = args.url
    path = args.path
    lst = args.list
    one_folder = args.dir
 
    if lst:
        legalListRE = re.compile(r\'^\\d+([,-]\\d+)*$\')
        if not legalListRE.match(lst):
            print(\'LIST参数不合法,请参考--help键入合法参数!\')
            exit(1)
 
    if not url:
        url = input(\'请输入漫画首页地址: \')
        path = input(\'请输入漫画保存路径(默认: {}): \'.format(defaultPath))
        if not path:
            path = defaultPath
 
    main(url, path, lst, one_folder)

五、下载源码

from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
import getComic
import os
import re
import sys
 
class TencentComicDownloader(QWidget):
    def __init__(self, parent=None):
        super(TencentComicDownloader, self).__init__(parent)
 
        nameLabel = QLabel(\"漫画首页:\")
 
        self.nameLine = QLineEdit()
 
        self.analysisButton = QPushButton(\"分析\")
        self.analysisButton.clicked.connect(self.anaysisURL)
        self.nameLine.returnPressed.connect(self.analysisButton.click)
 
        pathLineLabel = QLabel(\"下载路径:\")
        self.pathLine = QLineEdit()
        defaultPath = os.path.join(os.path.expanduser(\'~\'), \'tencent_comic\')
        self.pathLine.setText(defaultPath)
        
        self.browseButton = QPushButton(\"浏览\")
        self.browseButton.clicked.connect(self.getPath)
 
        comicNameLabel = QLabel(\"漫画名: \")
        self.comicNameLabel = QLabel(\"暂无\")
        self.one_folder_checkbox = QCheckBox(\"单目录\")
        
        comicIntroLabel = QLabel(\"简介: \")
        self.comicIntro = QLabel(\"暂无\")
        self.comicIntro.setWordWrap(True)
 
        chapterGroupBox = QGroupBox(\"章节列表:\")
        
        self.chapterListView = QListWidget(chapterGroupBox)
        self.chapterListView.setSelectionMode(QAbstractItemView.ExtendedSelection)
        self.chapterListView.setEnabled(False)
 
        groupBoxLayout = QHBoxLayout(chapterGroupBox)
        groupBoxLayout.addWidget(self.chapterListView)
 
        self.downloadButton = QPushButton(\"下载选中\")
        self.statusLabel = QLabel(\"输入要下载的漫画的首页,然后点分析\")
        self.statusLabel.setWordWrap(True)
 
        self.downloadButton.setEnabled(False)
        self.downloadButton.clicked.connect(self.download)
 
        mainLayout = QGridLayout()
        mainLayout.addWidget(nameLabel, 0, 0)
        mainLayout.addWidget(self.nameLine, 0, 1)
        mainLayout.addWidget(self.analysisButton, 0, 2)
        mainLayout.addWidget(pathLineLabel, 1, 0)
        mainLayout.addWidget(self.pathLine, 1, 1)
        mainLayout.addWidget(self.browseButton, 1, 2)
        mainLayout.addWidget(comicNameLabel, 2, 0)
        mainLayout.addWidget(self.comicNameLabel, 2, 1, 1, 2)
        mainLayout.addWidget(self.one_folder_checkbox, 2, 2)
        mainLayout.addWidget(comicIntroLabel, 3, 0)
        mainLayout.addWidget(self.comicIntro, 3, 1, 1, 2)
        mainLayout.addWidget(chapterGroupBox, 4, 0, 1, 3)
        mainLayout.addWidget(self.downloadButton, 5, 2)
        mainLayout.addWidget(self.statusLabel, 5, 0, 1, 2)
 
        self.setLayout(mainLayout)
        self.setWindowTitle(\"腾讯漫画下载\")
        self.setGeometry(400, 300, 800, 500)
 
    def setStatus(self, status):
        self.statusLabel.setText(status)
 
    def enableWidget(self, enable):
        widgets_list = [
                self.downloadButton,
                self.nameLine,
                self.pathLine,
                self.chapterListView,
                self.analysisButton,
                self.browseButton,
                self.one_folder_checkbox
        ]
        for widget in widgets_list:
            widget.setEnabled(enable)
 
        if enable:
            self.downloadButton.setText(\'下载选中\')
            self.chapterListView.setFocus()
 
    def getPath(self):
        path = str(QFileDialog.getExistingDirectory(self, \"选择下载目录\"))
        if path:
            self.pathLine.setText(path)
 
    def anaysisURL(self):
        url = self.nameLine.text()
 
        self.downloadButton.setEnabled(False)
        self.comicNameLabel.setText(\"暂无\")
        self.comicIntro.setText(\"暂无\")
        self.chapterListView.clear()
        self.chapterListView.setEnabled(False)
 
        try:
            if getComic.isLegelUrl(url):
                self.id = getComic.getId(url)
                self.comicName,self.comicIntrd,self.count,self.contentList = getComic.getContent(self.id)
 
                self.contentNameList = []
                for item in self.contentList:
                    for k in item:
                        self.contentNameList.append(item[k][\'t\'])
                
                self.comicNameLabel.setText(self.comicName)
                self.comicIntro.setText(self.comicIntrd)
                self.chapterListView.setEnabled(True)
                self.downloadButton.setEnabled(True)
                self.chapterListView.setFocus()
                self.statusLabel.setText(\'选择要下载的章节后点击右侧按钮\')
 
                for i in range(len(self.contentNameList)):
                    self.chapterListView.addItem(\'第{0:0>4}话-{1}\'.format(i+1, self.contentNameList[i]))
                    self.chapterListView.item(i).setSelected(True)
 
                self.downloadButton.setEnabled(True)
 
            else:
                self.statusLabel.setText(\'<font color=\"red\">错误的URL格式!请输入正确的漫画首页地址!</font>\')
 
        except getComic.ErrorCode as e:
            if e.code == 2:
                self.statusLabel.setText(\'<font color=\"red\">无法跳转为移动端URL,请进入http://m.ac.qq.com找到该漫画地址</font>\')
 
        except KeyError:
            self.statusLabel.setText(\'<font color=\"red\">不存在的地址</font>\')
 
    def download(self):
        self.downloadButton.setText(\"下载中...\")
        one_folder = self.one_folder_checkbox.isChecked()
 
        self.enableWidget(False)
 
        selectedChapterList = [ item.row() for item in self.chapterListView.selectedIndexes() ]
 
        path = self.pathLine.text()
        comicName = self.comicName
        forbiddenRE = re.compile(r\'[\\\\/\":*?<>|]\') #windows下文件名非法字符\\ / : * ? \" < > |
        comicName = re.sub(forbiddenRE, \'_\', comicName) #将windows下的非法字符一律替换为_
        comicPath = os.path.join(path, comicName)
 
        if not os.path.isdir(comicPath):
            os.makedirs(comicPath)
 
        self.downloadThread = Downloader(selectedChapterList, comicPath, self.contentList, self.contentNameList, self.id, one_folder)
        self.downloadThread.output.connect(self.setStatus)
        self.downloadThread.finished.connect(lambda: self.enableWidget(True))
        self.downloadThread.start()
        
class Downloader(QThread):
    output = pyqtSignal([\'QString\'])
    finished = pyqtSignal()
 
    def __init__(self, selectedChapterList, comicPath, contentList, contentNameList, id, one_folder=False, parent=None):
        super(Downloader, self).__init__(parent)
 
        self.selectedChapterList = selectedChapterList
        self.comicPath = comicPath
        self.contentList = contentList
        self.contentNameList = contentNameList
        self.id = id
        self.one_folder = one_folder
 
    def run(self):
        try:
            for i in self.selectedChapterList:
                outputString = \'正在下载第{0:0>4}话: {1}...\'.format(i+1, self.contentNameList[i])
                print(outputString)
                self.output.emit(outputString)
                forbiddenRE = re.compile(r\'[\\\\/\":*?<>|]\') #windows下文件名非法字符\\ / : * ? \" < > |
                self.contentNameList[i] = re.sub(forbiddenRE, \'_\', self.contentNameList[i])
                contentPath = os.path.join(self.comicPath, \'第{0:0>4}话-{1}\'.format(i+1, self.contentNameList[i]))
                if not self.one_folder:
                    if not os.path.isdir(contentPath):
                        os.mkdir(contentPath)
                imgList = getComic.getImgList(self.contentList[i], self.id)
                getComic.downloadImg(imgList, contentPath, self.one_folder)
                
                self.output.emit(\'完毕!\')
       
        except Exception as e:
            self.output.emit(\'<font color=\"red\">{}</font>\\n\'
                    \'遇到异常!请尝试重新点击下载按钮重试\'.format(e))
            raise
 
        finally:
            self.finished.emit()
 
if __name__ == \'__main__\':
    app = QApplication(sys.argv)
 
    main = TencentComicDownloader()
    main.show()
 
    app.exec_()
© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发

请登录后发表评论

    暂无评论内容