
Python crawler in practice: batch-scraping and verifying KuaiDaili free proxy addresses with Python 3


Special note: this article is original work. You are free to repost and quote it, but you must credit the author and cite the source. Please get in touch if any rights are infringed!

Below is a Python crawler that batch-scrapes and verifies the free proxy addresses published by KuaiDaili. It collects each proxy record from the list pages into a queue, starts 30 threads that test every proxy against Baidu, and saves both the raw and the verified lists to text files. The script depends on requests, beautifulsoup4 and lxml (pip install requests beautifulsoup4 lxml).

import requests
import threading
from bs4 import BeautifulSoup
from queue import Queue, Empty

# Browser-style User-Agent so the site does not reject bare script requests
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
# Field separator used when joining proxy records
Separator = '|'
# Shared queue holding proxy records that await verification
data_queue = Queue()
def get_list(page):
    countNum = 0
    All_proxy = []
    for p in range(1, page + 1):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(p)
        print(url)
        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        trs = soup.find('table', class_='table').find_all('tr')

        # Skip the header row, then pull one proxy record out of each row
        for tr in trs[1:]:
            tds = tr.find_all('td')
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            anony = tds[2].text.strip()      # anonymity level
            protocol = tds[3].text.strip()   # HTTP / HTTPS
            locate = tds[4].text.strip()     # location
            time = tds[6].text.strip()       # last-verified time
            proxy_str = Separator.join([ip, port, protocol, anony, locate, time])
            data_queue.put(proxy_str)
            All_proxy.append(proxy_str)
            print(proxy_str)
            countNum += 1
    text_save('Alldaili_kuai.txt', All_proxy)
    return countNum

def verifyProxyList(items):
    print('Thread %s: started; %d records left in the queue' % (threading.current_thread().name, data_queue.qsize()))
    while True:
        try:
            # get_nowait() avoids the race between empty() and get()
            # when several threads drain the queue at the same time
            ll = data_queue.get_nowait()
        except Empty:
            break
        line = ll.strip().split(Separator)
        daili = '{}://{}:{}'.format(line[2].lower(), line[0], line[1])
        # The test URL's scheme must match the proxies key, otherwise
        # requests silently bypasses the proxy and every check "succeeds"
        if line[2].lower() == 'http':
            myurl = 'http://www.baidu.com/'
            proxies = {'http': daili}
        else:
            myurl = 'https://www.baidu.com/'
            proxies = {'https': daili}
        try:
            r = requests.get(url=myurl, headers=headers, proxies=proxies, timeout=5)
            items.append(ll + Separator + 'verified OK')
            print(daili + ' connection succeeded!')
        except Exception as e:
            # items.append(ll + Separator + 'verification failed')
            print(daili + ' connection failed!')
    print('Thread %s: finished; %d records left in the queue' % (threading.current_thread().name, data_queue.qsize()))


def text_save(filename, data):  # filename: path of the output txt file; data: list of records to write
    with open(filename, 'a', encoding='utf-8') as file:
        for i in range(len(data)):
            s = str(data[i]).replace('[', '').replace(']', '')  # strip brackets; these two lines
            s = s.replace("'", '').replace(',', '') + '\n'      # (quote/comma stripping) are optional depending on the data
            file.write(s)
    print('File saved successfully')

def main():
    page = int(input('Enter the total number of pages to scrape: '))
    if page > 10:
        page = 10
    # page = 10
    countNum = get_list(page)
    print('Scraped %d pages, %d records in total!' % (page, countNum))
    print('Now verifying each proxy, please wait......')
    all_thread = []
    verify_list = []
    # Spin up 30 worker threads to drain the queue concurrently
    for i in range(1, 31):
        t = threading.Thread(target=verifyProxyList, name='Thread-' + str(i), args=(verify_list,))
        all_thread.append(t)
        t.start()

    for t in all_thread:
        t.join()

    text_save('verified_kuaidaili.txt', verify_list)

    print('Verification finished, %d proxies are valid' % len(verify_list))
    print('All Done.')

if __name__ == '__main__':
    main()
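
Once verified_kuaidaili.txt exists, the verified proxies can be plugged straight back into requests. Below is a minimal sketch of one way to do that: the file name and the field order (ip|port|protocol|...) match the script above, but the load_verified helper is just an illustrative name, and httpbin.org/ip is only a stand-in target that echoes back the caller's IP so you can confirm the proxy is actually in use.

import random
import requests

def load_verified(filename='verified_kuaidaili.txt'):
    # Each saved line looks like: ip|port|protocol|anonymity|location|time|verified OK
    pool = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('|')
            if len(fields) < 3:
                continue
            pool.append('{}://{}:{}'.format(fields[2].lower(), fields[0], fields[1]))
    return pool

proxy_pool = load_verified()
daili = random.choice(proxy_pool)
scheme = daili.split('://')[0]
# Key the proxies dict by the proxy's own scheme, same as in the verifier above
r = requests.get('{}://httpbin.org/ip'.format(scheme), proxies={scheme: daili}, timeout=5)
print(r.text)

Keep in mind that free proxies go stale quickly, so it is worth re-running the verification pass shortly before you actually use the pool.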
