
    Posting four blog crawlers I wrote myself

    Published on 2023-06-28


    • This post builds on the earlier posts in my Python crawler basics series; it only uses requests plus regular expressions to implement the most basic functionality (a minimal sketch of the shared skeleton follows this list).
    • These four crawlers scrape friends' blogs, and the owners were all told beforehand. If you crawl other sites, take care not to break the law 🐶
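
    All four scripts share the same skeleton: request the blog's index page, pull the post paths out with a regular expression, then request each post and cut the title and body out with more regexes before saving the body to an .htm file. The sketch below shows that skeleton only; the base URL and both patterns are placeholders rather than the exact ones used in the scripts that follow.

    import re
    import requests
    
    s = requests.Session()
    
    # Placeholder target; each script below hard-codes one friend's blog instead.
    BASE = 'https://example-blog.github.io'
    
    index = s.get(BASE)
    # Hexo/Jekyll-style archives usually link posts as /20xx/mm/dd/slug/
    paths = re.findall(r'"(/20.*?/)"', index.text)
    
    for path in paths:
        page = s.get(BASE + path)
        title = re.search(r'<title>(.*?)</title>', page.text, re.DOTALL)
        body = re.search(r'<article.*?>(.*?)</article>', page.text, re.DOTALL)
        if title is None or body is None:
            continue
        # Save the raw HTML body under the post title
        # (titles containing path separators would need sanitising first).
        with open(title.group(1).strip() + '.htm', 'w', encoding='utf-8') as f:
            f.write(body.group(1))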

    baicany

    import re
    import sys
    import requests
    import logging
    
    s = requests.Session()
    logging.captureWarnings(True)
    def find(pattern,input_string):
        # return the first capture group, or None if nothing matches
        match = re.search(pattern, input_string, re.DOTALL)
        if match:
            return match.group(1)
        return None
    
    def findall(pattern,input_string):
        return re.findall(pattern, input_string)
    
    data = {'name': 'germey', 'age': '25'}
    
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    #r = s.post('https://baicany.github.io', headers=headers,params=data,verify=False)
    print("目标靶机:https://baicany.github.io")
    print("尝试提取链接中")
    try:
        aim = s.get('https://baicany.github.io')
        urls=findall(r"/20(.*?)/\"",aim.text)
        print("获取成功:")
        print(urls)
    except requests.exceptions.RequestException:
        print("连接失败,请检查网络")
        sys.exit(1)  # stop here; urls would be undefined below
    if len(urls) == 0:
        print('提取失败,请联系作者')
        sys.exit(1)
    
    count=1
    for url in urls:
        print("开始提取第%d篇网页"%count)
        fakeblog = s.get('https://baicany.github.io/20'+url)
        title=find(r'<title itemprop="name">(.*?) \| baicany</title>',fakeblog.text)
        content=find(r"<div class=\"entry-content\">(.*?)<!-- \.entry-content -->",fakeblog.text)
        with open(title+".htm",'w', encoding='utf-8') as f:
            f.write(content)
        print("获取第%d篇网页成功,内容:%s"%(count,title))
        count=count+1
    

    liberator

    import re
    import sys
    import requests
    import logging
    
    s = requests.Session()
    logging.captureWarnings(True)
    
    def find(pattern,input_string):
        match = re.search(pattern, input_string, re.DOTALL)
        if match:
            return match.group(1)
        return None
    
    def findall(pattern,input_string):
        return re.findall(pattern, input_string)
    
    def replace(pattern,input_string, replacement):
        return re.sub(pattern, replacement, input_string, flags=re.DOTALL)
    
    data = {'name': 'germey', 'age': '25'}
    proxies = {
      'http': ''
    }
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    #r = s.post('https://yyb20040616.github.io/archives/', headers=headers,params=data,verify=False)
    print("目标靶机:https://yyb20040616.github.io/archives/")
    print("尝试提取链接中")
    try:
        aim = s.get('https://yyb20040616.github.io/archives/')
        urls=findall(r"/20(.*?)/\"",aim.text)
        print("获取成功:")
        print(urls)
    except requests.exceptions.RequestException:
        print("连接失败,请检查网络")
        sys.exit(1)
    if len(urls) == 0:
        print('提取失败,请联系作者')
        sys.exit(1)
    
    count=1
    for url in urls:
        print("开始提取第%d篇网页"%count)
        fakeblog = s.get('https://yyb20040616.github.io/20'+url)
        title=find(r'<title>(.*?) \| liberator</title>',fakeblog.text)
        if title is None:
            title=find(r'<h2 >(.*?)</h2>',fakeblog.text)
        content=find(r'<div class="post-body" itemprop="articleBody">(.*?)<footer class="post-footer">',fakeblog.text)
        if content is None or find(r'<p>(.*?)</p>',content) is None:
            print("内容无效")
            count=count+1
            continue
        imgs=findall(r'<img src="(.*?)">',content)
        for img in imgs:
            # prefix each relative image path with the blog domain so images still load from the saved copy
            content=content.replace('<img src="'+img+'">','<img src="https://yyb20040616.github.io'+img+'">')
        with open(title+".htm",'w', encoding='utf-8') as f:
            f.write(content)
        print("获取第%d篇网页成功,内容:%s"%(count,title))
        count=count+1
    

    luna

    import re
    import sys
    import requests
    import logging
    
    s = requests.Session()
    logging.captureWarnings(True)
    
    def find(pattern,input_string):
        match = re.search(pattern, input_string, re.DOTALL)
        if match:
            return match.group(1)
        return None
    
    def findall(pattern,input_string):
        return re.findall(pattern, input_string)
    
    def replace(pattern,input_string, replacement):
        return re.sub(pattern, replacement, input_string, flags=re.DOTALL)
    
    data = {'name': 'germey', 'age': '25'}
    
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    #r = s.post('https://pr1ncesslun4.github.io/', headers=headers,params=data,verify=False)
    print("目标靶机:https://pr1ncesslun4.github.io/")
    print("尝试提取链接中")
    try:
        aim = s.get('https://pr1ncesslun4.github.io/')
        urls=findall(r"/20(.*?)/\"",aim.text)
        print("获取成功:")
        print(urls)
    except requests.exceptions.RequestException:
        print("连接失败,请检查网络")
        sys.exit(1)
    if len(urls) == 0:
        print('提取失败,请联系作者')
        sys.exit(1)
    
    count=1
    for url in urls:
        print("开始提取第%d篇网页"%count)
        fakeblog = s.get('https://pr1ncesslun4.github.io/20'+url)
        title=find(r'<h2 id="(.*?)">',fakeblog.text)
        if title is None:
            title=find(r'<h2 >(.*?)</h2>',fakeblog.text)
        content=find(r'<!-- Post -->(.*?)<!-- Post Comments -->',fakeblog.text)
        if content is None or find(r'<p>(.*?)</p>',content) is None:
            print("内容无效")
            count=count+1
            continue
        # escape the dots so "/../images/" is matched literally
        content=replace(r'/\.\./images/',content,r'https://pr1ncesslun4.github.io/images/')
        with open(title+".htm",'w', encoding='utf-8') as f:
            f.write(content)
        print("获取第%d篇网页成功,内容:%s"%(count,title))
        count=count+1
    

    bamboo

    • http://8.130.136.54/
    • His SSL certificate broke recently and the domain can't be reached, so this script doesn't run at the moment; the pages I grabbed earlier are still saved 😋 (a rough fallback sketch follows this list)
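
    While the domain is down only the raw IP answers, so one possible workaround (a rough sketch, not something the script below does) is to fall back to the IP when the request to the domain fails:

    import requests
    
    s = requests.Session()
    try:
        # Try the real domain first.
        aim = s.get('https://www.bamboo22.top', timeout=10)
    except requests.exceptions.RequestException:
        # The certificate does not cover a bare IP, so fall back to plain HTTP on the IP.
        aim = s.get('http://8.130.136.54/', timeout=10)
    print(aim.status_code)

    The original script below still targets https://www.bamboo22.top unchanged.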
    import re
    import sys
    import requests
    import logging
    
    s = requests.Session()
    logging.captureWarnings(True)
    def find(pattern,input_string):
        match = re.search(pattern, input_string, re.DOTALL)
        if match:
            return match.group(1)
        return None
    
    def findall(pattern,input_string):
        return re.findall(pattern, input_string)
    
    def replace(pattern,input_string, replacement):
        return re.sub(pattern, replacement, input_string, flags=re.DOTALL)
    
    data = {'name': 'germey', 'age': '25'}
    
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    #r = s.post('https://baicany.github.io', headers=headers,params=data,verify=False)
    print("目标靶机:https://www.bamboo22.top")
    print("尝试提取链接中")
    try:
        aim = s.get('https://www.bamboo22.top',headers=headers)
        urls=findall(r'',aim.text)
        print("获取成功:")
        print(urls)
    except requests.exceptions.RequestException:
        print("连接失败,请检查网络")
        sys.exit(1)
    if len(urls) == 0:
        print('提取失败,请联系作者')
        sys.exit(1)
    
    count=1
    for url in urls:
        url=replace(r'https://8\.130\.136\.54',url,r'https://www.bamboo22.top')
        print("开始提取第%d篇网页"%count)
        print(url)
        fakeblog = s.get(url)
        title=find(r'(.*?) - Bamboo22的博客',fakeblog.text)
        content=find(r'(.*?)'+"文末附加内容",fakeblog.text)
        imgs=findall(r"data-fancybox='post-images' href='(.*?)'>"
        content=replace(r"

    Last updated on 2024-02-07