加入收藏 | 设为首页 | 会员中心 | 我要投稿 拼字网 - 核心网 (https://www.hexinwang.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 站长学院 > PHP教程 > 正文

php和python 线程池多线程爬虫的例子

发布时间:2022-02-24 13:57:16 所属栏目:PHP教程 来源:互联网
导读:本文给出 PHP 与 Python 的多线程/线程池爬虫示例:PHP 部分基于 pthreads 的 Worker/Pool 模式复用 cURL 连接,Python 部分分别演示多线程抓取与线程池抓取。完整代码见正文。
  php例子:
 
  <?php
  
  class Connect extends Worker  //worker模式
  {
  
  public function __construct()
  {
  
  }
  
  public function getConnection()
  {
  if (!self::$ch)
  {
  self::$ch = curl_init();
  curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
  curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt(self::$ch, CURLOPT_HEADER, 0);
  curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
  curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
  curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
  }
  
  /* do some exception/error stuff here maybe */
  
  return self::$ch;
  }
  
  public function closeConnection()
  {
  curl_close(self::$ch);
  }
  
  /**
  * Note that the link is stored statically, which for pthreads, means thread local
  * */
  protected static $ch;
  
  }
  
  class Query extends Threaded
  {
  
  public function __construct($url)
  {
  $this->url = $url;
  }
  
  public function run()
  {
  $ch = $this->worker->getConnection();
  curl_setopt($ch, CURLOPT_URL, $this->url);
  $page = curl_exec($ch);
  $info = curl_getinfo($ch);
  $error = curl_error($ch);
  $this->deal_data($this->url, $page, $info, $error);
  
  $this->result = $page;
  }
  
  function deal_data($url, $page, $info, $error)
  {
  $parts = explode(".", $url);
  
  $id = $parts[1];
  if ($info['http_code'] != 200)
  {
  $this->show_msg($id, $error);
  } else
  {
  $this->show_msg($id, "OK");
  }
  }
  
  function show_msg($id, $msg)
  {
  echo $id."/t$msg/n";
  }
  
  public function getResult()
  {
  return $this->result;
  }
  
  protected $url;
  protected $result;
  
  }
  
  function check_urls_multi_pthreads()
  {
  global $check_urls;  //定义抓取的连接
  $check_urls = array( 'http://xxx.com' => "xx网",);
  $pool = new Pool(10, "Connect", array()); //建立10个线程池
  foreach ($check_urls as $url => $name)
  {
  $pool->submit(new Query($url));
  }
  $pool->shutdown();
  }
  
  check_urls_multi_pthreads();
  python 多线程
 
  def handle(sid)://这个方法内执行爬虫数据处理
  
  pass
  class MyThread(Thread):
  """docstring for ClassName"""
  def __init__(self, sid):
  Thread.__init__(self)
  self.sid = sid
  
  def run():
  handle(self.sid)
  
  threads = []
  for i in xrange(1,11):
  t = MyThread(i)
  threads.append(t)
  t.start()
  
  for t in threads:
  t.join()
  python 线程池爬虫
 
  from queue import Queue  
  from threading import Thread, Lock
  import urllib.parse
  import socket
  import re
  import time
  
  seen_urls = set(['/'])
  lock = Lock()
  
  
  class Fetcher(Thread):
      def __init__(self, tasks):
          Thread.__init__(self)
          self.tasks = tasks
          self.daemon = True
  
          self.start()
  
      def run(self):
          while True:
              url = self.tasks.get()
              print(url)
              sock = socket.socket()
              sock.connect(('localhost', 3000))
              get = 'GET {} HTTP/1.0/r/nHost: localhost/r/n/r/n'.format(url)
              sock.send(get.encode('ascii'))
              response = b''
              chunk = sock.recv(4096)
              while chunk:
                  response += chunk
                  chunk = sock.recv(4096)
  
              links = self.parse_links(url, response)
  
              lock.acquire()
              for link in links.difference(seen_urls):
                  self.tasks.put(link)
              seen_urls.update(links)     
              lock.release()
  
              self.tasks.task_done()
  
      def parse_links(self, fetched_url, response):
          if not response:
              print('error: {}'.format(fetched_url))
              return set()
          if not self._is_html(response):
              return set()
          urls = set(re.findall(r'''(?i)href=["']?([^/s"'<>]+)''',
                                self.body(response)))
  
   class ThreadPool:
      def __init__(self, num_threads):
          self.tasks = Queue()
          for _ in range(num_threads):
              Fetcher(self.tasks)
  
      def add_task(self, url):
          self.tasks.put(url)
  
      def wait_completion(self):
          self.tasks.join()
  
  if __name__ == '__main__':
      start = time.time()
      pool = ThreadPool(4)
      pool.add_task("/")
      pool.wait_completion()
      print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls),time.time() - start)) 

(编辑:拼字网 - 核心网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!