
Python crawler in practice: batch-scraping and verifying KuaiDaili free proxy addresses with Python 3


Special note: this article is original work. You are free to repost and quote it, but you must credit the author and cite the source. Please get in touch if any rights are infringed!

Below is a Python crawler that batch-scrapes and verifies the free proxy addresses published by KuaiDaili. It collects each proxy record from the list pages into a queue, starts 30 threads that test every proxy against Baidu, and saves both the raw and the verified lists to text files. The script depends on requests, beautifulsoup4 and lxml (pip install requests beautifulsoup4 lxml).

import requests
import threading
from bs4 import BeautifulSoup
from queue import Queue, Empty

# Browser-style User-Agent so the site does not reject bare script requests
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
# Field separator used when joining proxy records
Separator = '|'
# Shared queue holding proxy records that await verification
data_queue = Queue()
def get_list(page):
    countNum = 0
    All_proxy = []
    for p in range(1, page + 1):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(p)
        print(url)
        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        trs = soup.find('table', class_='table').find_all('tr')

        # Skip the header row, then pull one proxy record out of each row
        for tr in trs[1:]:
            tds = tr.find_all('td')
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            anony = tds[2].text.strip()      # anonymity level
            protocol = tds[3].text.strip()   # HTTP / HTTPS
            locate = tds[4].text.strip()     # location
            time = tds[6].text.strip()       # last-verified time
            proxy_str = Separator.join([ip, port, protocol, anony, locate, time])
            data_queue.put(proxy_str)
            All_proxy.append(proxy_str)
            print(proxy_str)
            countNum += 1
    text_save('Alldaili_kuai.txt', All_proxy)
    return countNum

def verifyProxyList(items):
    print('Thread %s: started; %d records left in the queue' % (threading.current_thread().name, data_queue.qsize()))
    while True:
        try:
            # get_nowait() avoids the race between empty() and get()
            # when several threads drain the queue at the same time
            ll = data_queue.get_nowait()
        except Empty:
            break
        line = ll.strip().split(Separator)
        daili = '{}://{}:{}'.format(line[2].lower(), line[0], line[1])
        # The test URL's scheme must match the proxies key, otherwise
        # requests silently bypasses the proxy and every check "succeeds"
        if line[2].lower() == 'http':
            myurl = 'http://www.baidu.com/'
            proxies = {'http': daili}
        else:
            myurl = 'https://www.baidu.com/'
            proxies = {'https': daili}
        try:
            r = requests.get(url=myurl, headers=headers, proxies=proxies, timeout=5)
            items.append(ll + Separator + 'verified OK')
            print(daili + ' connection succeeded!')
        except Exception as e:
            # items.append(ll + Separator + 'verification failed')
            print(daili + ' connection failed!')
    print('Thread %s: finished; %d records left in the queue' % (threading.current_thread().name, data_queue.qsize()))


def text_save(filename, data):  # filename: path of the output txt file; data: list of records to write
    with open(filename, 'a', encoding='utf-8') as file:
        for i in range(len(data)):
            s = str(data[i]).replace('[', '').replace(']', '')  # strip brackets; these two lines
            s = s.replace("'", '').replace(',', '') + '\n'      # (quote/comma stripping) are optional depending on the data
            file.write(s)
    print('File saved successfully')

def main():
    page = int(input('Enter the total number of pages to scrape: '))
    if page > 10:
        page = 10
    # page = 10
    countNum = get_list(page)
    print('Scraped %d pages, %d records in total!' % (page, countNum))
    print('Now verifying each proxy, please wait......')
    all_thread = []
    verify_list = []
    # Spin up 30 worker threads to drain the queue concurrently
    for i in range(1, 31):
        t = threading.Thread(target=verifyProxyList, name='Thread-' + str(i), args=(verify_list,))
        all_thread.append(t)
        t.start()

    for t in all_thread:
        t.join()

    text_save('verified_kuaidaili.txt', verify_list)

    print('Verification finished, %d proxies are valid' % len(verify_list))
    print('All Done.')

if __name__ == '__main__':
    main()
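
Once verified_kuaidaili.txt exists, the verified proxies can be plugged straight back into requests. Below is a minimal sketch of one way to do that: the file name and the field order (ip|port|protocol|...) match the script above, but the load_verified helper is just an illustrative name, and httpbin.org/ip is only a stand-in target that echoes back the caller's IP so you can confirm the proxy is actually in use.

import random
import requests

def load_verified(filename='verified_kuaidaili.txt'):
    # Each saved line looks like: ip|port|protocol|anonymity|location|time|verified OK
    pool = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('|')
            if len(fields) < 3:
                continue
            pool.append('{}://{}:{}'.format(fields[2].lower(), fields[0], fields[1]))
    return pool

proxy_pool = load_verified()
daili = random.choice(proxy_pool)
scheme = daili.split('://')[0]
# Key the proxies dict by the proxy's own scheme, same as in the verifier above
r = requests.get('{}://httpbin.org/ip'.format(scheme), proxies={scheme: daili}, timeout=5)
print(r.text)

Keep in mind that free proxies go stale quickly, so it is worth re-running the verification pass shortly before you actually use the pool.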
