Опубликовано

Класс myCurl для парсинга

<?php

/**
 * Thin cURL wrapper for scraping: fetches pages, downloads files and
 * follows HTTP / meta-refresh / JavaScript redirects.
 *
 * Relies on two constants defined elsewhere in the project:
 *  - DIR_SAVE    directory containing "config.ini" (its [curl] section maps
 *                CURLOPT_* constant names to values);
 *  - DIR_COOKIES directory where per-thread cookie jars are stored.
 */
class myCurl
{

    /** @var int Maximum number of redirect hops followed by get_final_url(). */
    public $limit_redirect;

    /** @var array|false Proxy as array(host, port) or array(host, port, user, password); false to disable. */
    public $proxy;

    /** @var int|string Identifier used to build the cookie-jar file name. */
    public $thread_num;

    /**
     * @param array|false $proxy          array(host, port[, user, password]) or false for a direct connection
     * @param int|string  $thread_num     cookie-jar identifier
     * @param int         $limit_redirect maximum redirect hops for get_final_url()
     */
    public function __construct($proxy = false, $thread_num = 0, $limit_redirect = 4)
    {
        $this->proxy          = $proxy;
        $this->thread_num     = $thread_num;
        $this->limit_redirect = $limit_redirect;
    }

    /**
     * Fetches $url and runs preg_match_all() with $pattern over the body.
     *
     * @param string $url
     * @param string $pattern PCRE pattern (with delimiters)
     *
     * @return array|null preg_match_all() matches, or null when the response body is empty/falsy
     */
    public function getData($url, $pattern)
    {
        $subject = $this->get_web_page($url);
        if (!$subject['content']) {
            return null;
        }

        $matches = array();
        preg_match_all($pattern, $subject['content'], $matches);

        return $matches;
    }

    /**
     * Downloads $url and stores the raw response body in $saveto,
     * replacing any existing file.
     *
     * @param string $url
     * @param string $saveto destination file path
     *
     * @return bool true when the body was fetched and written, false otherwise
     */
    public function grab_image($url, $saveto)
    {
        // CURLOPT_BINARYTRANSFER is intentionally not set: it has had no effect
        // since PHP 5.1.3 (RETURNTRANSFER always yields raw bytes) and is deprecated.
        $options = array_replace(
            $this->proxyOptions(),
            array(
                CURLOPT_HEADER         => 0,
                CURLOPT_RETURNTRANSFER => 1,
            )
        );

        // A single handle only; the original created two and leaked the first.
        $ch = curl_init($url);
        if ($ch === false) {
            return false;
        }
        curl_setopt_array($ch, $options);

        $raw = curl_exec($ch);
        // Close before any early return so the handle is released on failure too.
        curl_close($ch);

        if ($raw === false || $raw === '') {
            return false;
        }

        // file_put_contents() truncates/creates in one call, avoiding the
        // unlink() + fopen('x') race and reporting write failures to the caller.
        return file_put_contents($saveto, $raw) !== false;
    }

    /**
     * Performs a GET (or POST when $post is a non-empty array) request using
     * the options from the [curl] section of DIR_SAVE/config.ini, a per-thread
     * cookie jar and the configured proxy.
     *
     * Note: 'content' is whatever curl_exec() returns, so it is the body
     * string only when the configuration enables CURLOPT_RETURNTRANSFER.
     *
     * @param string      $url
     * @param array|false $post POST fields, or false for a GET request
     *
     * @return array array('content' => string|bool, 'errno' => int, 'errmsg' => string, 'response' => array)
     *               where 'response' is the curl_getinfo() result
     */
    public function get_web_page($url, $post = false)
    {
        $options = $this->configOptions();

        $cookie                         = DIR_COOKIES . $this->thread_num . "_cookie.txt";
        $options[CURLOPT_COOKIESESSION] = true;
        $options[CURLOPT_COOKIEJAR]     = $cookie;
        $options[CURLOPT_COOKIEFILE]    = $cookie;

        // array_replace() keeps integer keys and lets proxy settings override the config.
        $options = array_replace($options, $this->proxyOptions());

        if ($post && is_array($post)) {
            $options[CURLOPT_POST]       = 1;
            $options[CURLOPT_POSTFIELDS] = $post;
        }

        $ch = curl_init($url);
        if ($ch === false) {
            return array(
                'content'  => false,
                'errno'    => CURLE_FAILED_INIT,
                'errmsg'   => 'curl_init() failed',
                'response' => array(),
            );
        }
        curl_setopt_array($ch, $options);

        $ret             = array();
        $ret['content']  = curl_exec($ch);
        $ret['errno']    = curl_errno($ch);
        $ret['errmsg']   = curl_error($ch);
        $ret['response'] = curl_getinfo($ch);

        curl_close($ch);

        return $ret;
    }

    /**
     * Issues a header-only request and returns the absolute URL named by the
     * first "Location" header, if any.
     *
     * @param string $url
     *
     * @return string|false absolute redirect target, or false when there is no
     *                      Location header or it cannot be resolved against $url
     */
    public function get_redirect_target($url)
    {
        $options = array_replace(
            array(
                CURLOPT_HEADER         => 1,
                CURLOPT_NOBODY         => 1,
                CURLOPT_RETURNTRANSFER => 1,
            ),
            $this->proxyOptions()
        );

        $ch = curl_init($url);
        if ($ch === false) {
            return false;
        }
        curl_setopt_array($ch, $options);
        $headers = curl_exec($ch);
        curl_close($ch);

        // curl_exec() yields false on transport errors; the space after the
        // colon is optional in HTTP headers.
        if (!is_string($headers) || !preg_match('/^Location:[ \t]*(.+)$/im', $headers, $matches)) {
            return false;
        }

        $target = trim($matches[1]);
        if ($target === '') {
            return false;
        }

        $components = parse_url($target);
        if (is_array($components) && isset($components['scheme'], $components['host'])) {
            return $target;
        }

        return $this->resolveUrl($url, $target);
    }

    /**
     * Follows HTTP redirects, <meta refresh> tags and simple
     * window.location assignments until no further redirect is found or the
     * hop limit is exceeded.
     *
     * @param string $url
     * @param int    $count hops already taken (used by the recursion)
     *
     * @return string|null the last URL reached; when the limit is exceeded the
     *                     current $url is returned unchanged
     */
    public function get_final_url($url, $count = 0)
    {
        if ($count > $this->limit_redirect) {
            return $url;
        }
        $count++;

        $redir = $this->get_redirect_target($url);
        if ($redir) {
            return $this->get_final_url($redir, $count);
        }

        $subject = $this->get_web_page($url);
        $content = is_string($subject['content']) ? $subject['content'] : '';

        // The captures stop at the next quote. The original greedy "(.*)" under
        // the /s flag ran to the last quote in the document, so the captured
        // text almost never validated as a URL.
        $quote    = '["\']';
        $unquoted = '([^"\']*)';

        $patterns = array(
            // Lazy quantifiers so the first URL after "URL=" is taken, not the last "http" on the line.
            '/meta.*?refresh.*?URL\s*=\s*' . $quote . '?\s*(http[^"\'\s>]*)/i',
            '/window\.location.*?=.*?' . $quote . $unquoted . $quote . '/is',
            '/window\.location\.href\s*=.*?' . $quote . $unquoted . $quote . '/is',
            '/window\.location\.(?:replace|assign)\(\s*' . $quote . $unquoted . $quote . '\s*\)/i',
        );

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $content, $value)
                && filter_var($value[1], FILTER_VALIDATE_URL) !== false
            ) {
                return $this->get_final_url($value[1], $count);
            }
        }

        // curl_getinfo() reports the effective URL; fall back to the request
        // URL when no transfer information is available.
        return isset($subject['response']['url']) ? $subject['response']['url'] : $url;
    }

    /**
     * Builds the CURLOPT_PROXY* options for the configured proxy.
     *
     * @return array cURL options keyed by CURLOPT_* constant; empty when no proxy is set
     */
    private function proxyOptions()
    {
        if (!$this->proxy) {
            return array();
        }

        $options = array(
            CURLOPT_PROXY     => $this->proxy[0],
            CURLOPT_PROXYPORT => $this->proxy[1],
        );
        // Credentials are applied only when both user and password are supplied.
        if (count($this->proxy) == 4) {
            $options[CURLOPT_PROXYUSERPWD] = $this->proxy[2] . ':' . $this->proxy[3];
        }

        return $options;
    }

    /**
     * Reads the [curl] section of DIR_SAVE/config.ini and maps its CURLOPT_*
     * constant names to their integer option ids.
     *
     * @return array cURL options; empty when the file or section is missing
     */
    private function configOptions()
    {
        $file = DIR_SAVE . 'config.ini';
        if (!is_file($file) || !is_readable($file)) {
            return array();
        }

        $config = parse_ini_file($file, true, INI_SCANNER_TYPED);
        if (!is_array($config) || !isset($config['curl']) || !is_array($config['curl'])) {
            return array();
        }

        $options = array();
        foreach ($config['curl'] as $key => $curlopt) {
            // constant() throws on PHP 8 for unknown names; report and skip instead.
            if (!defined($key)) {
                trigger_error('Unknown cURL option "' . $key . '" in ' . $file, E_USER_WARNING);
                continue;
            }
            $options[constant($key)] = $curlopt;
        }

        return $options;
    }

    /**
     * Resolves a relative redirect target against the URL that produced it.
     * Handles protocol-relative ("//host/x"), absolute-path ("/x"), query-only
     * ("?a=1"), fragment-only ("#f") and path-relative ("x/y") references.
     * Dot segments ("../") are not collapsed here.
     *
     * @param string $base     absolute URL of the request
     * @param string $relative non-empty relative reference
     *
     * @return string|false absolute URL, or false when $base has no host
     */
    private function resolveUrl($base, $relative)
    {
        $parts = parse_url($base);
        if (!is_array($parts) || !isset($parts['host'])) {
            return false;
        }

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        if (strncmp($relative, '//', 2) === 0) {
            return $scheme . ':' . $relative;
        }

        $authority = $scheme . '://';
        if (isset($parts['user'])) {
            $authority .= $parts['user'];
            if (isset($parts['pass'])) {
                $authority .= ':' . $parts['pass'];
            }
            $authority .= '@';
        }
        $authority .= $parts['host'];
        if (isset($parts['port'])) {
            $authority .= ':' . $parts['port'];
        }

        $path = isset($parts['path']) ? $parts['path'] : '/';

        switch ($relative[0]) {
            case '/':
                return $authority . $relative;
            case '?':
                return $authority . $path . $relative;
            case '#':
                $query = isset($parts['query']) ? '?' . $parts['query'] : '';
                return $authority . $path . $query . $relative;
        }

        // Path-relative: replace everything after the last "/" of the base path.
        $directory = substr($path, 0, strrpos($path, '/') + 1);

        return $authority . $directory . $relative;
    }
}

Где-то нашёл этот крутой класс для парсинга.

Добавить комментарий

Ваш e-mail не будет опубликован.