// Load hQuery/HTML_Parser.php (relative to this file's directory) unless the
// class duzun\hQuery\HTML_Parser is already defined. The `false` argument
// tells class_exists() not to trigger autoloading for this check.
4 class_exists(
'duzun\\hQuery\\HTML_Parser',
false) or require_once __DIR__ . DIRECTORY_SEPARATOR . 'hQuery' . DIRECTORY_SEPARATOR . 'HTML_Parser.php';
// Cache directory. fromURL() only consults/writes the cache when this is truthy.
27 public static $cache_path;
// Default value of the `expires` option in fromURL(). It is compared against
// filemtime()/time() in get_cache(), i.e. it is a number of seconds.
28 public static $cache_expires = 3600;
// When set, fromHTML() instantiates this class instead of `self`.
31 public static $_mockup_class;
/**
 * Build a document object from an HTML string and time the operation.
 *
 * The object is an instance of self::$_mockup_class when that property is
 * set, otherwise of `self`; in both cases the constructor receives $html and
 * `false`. The elapsed wall-clock time is stored on the object as
 * `index_time`, in milliseconds.
 *
 * @param string      $html HTML source handed to the constructor.
 * @param string|null $url  NOTE(review): how $url is applied to the document
 *                          should be confirmed against the full method body.
 * @return mixed NOTE(review): presumably the created $doc — fromFile() and
 *               fromURL() use the return value as the document object; confirm.
 */
41 public static function fromHTML($html, $url=NULL) {
42 $index_time = microtime(
true);
// Allow the concrete document class to be swapped via self::$_mockup_class.
43 if ( isset(self::$_mockup_class) ) {
44 $doc =
new self::$_mockup_class($html,
false);
47 $doc =
new self($html,
false);
// microtime(true) is in seconds; the stored value is converted to milliseconds.
53 $index_time = microtime(
true) - $index_time;
54 $doc->index_time = $index_time * 1000;
/**
 * Read a file and build a document from its contents.
 *
 * All three parameters are forwarded unchanged to file_get_contents(). If the
 * read fails (strict `false`), that `false` is returned. Otherwise the content
 * is passed to self::fromHTML() with $filename as the URL argument, and the
 * resulting object gets `source_type = 'file'` and `read_time` (duration of
 * the file_get_contents() call, in milliseconds).
 *
 * @param string        $filename         Path given to file_get_contents().
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Forwarded to file_get_contents().
 * @return mixed `false` when the file cannot be read. NOTE(review): the
 *               success return is presumably $doc — confirm.
 */
67 public static function fromFile($filename, $use_include_path=
false, $context=NULL) {
68 $read_time = microtime(
true);
69 $html = file_get_contents($filename, $use_include_path, $context);
70 $read_time = microtime(
true) - $read_time;
// Propagate the read failure instead of parsing a non-string.
71 if($html ===
false)
return $html;
72 $doc = self::fromHTML($html, $filename);
73 $doc->source_type =
'file';
74 $doc->read_time = $read_time * 1000;
/**
 * Fetch a URL, optionally through the file cache, and build a document from it.
 *
 * Defaults are the option `expires` = self::$cache_expires and the header
 * `Accept-Charset: UTF-8,*`. Caller-supplied $options and $headers are merged
 * with the array `+` operator, so caller keys win over the defaults.
 *
 * When `expires` is positive and self::$cache_path is set, a cache file name
 * is derived from the URL and the remaining options and looked up with
 * self::get_cache(). Cached metadata supplies the headers, status code and
 * URL; a live request goes through self::http_wr() and, when a cache file
 * name exists, is stored with self::set_cache().
 * NOTE(review): confirm the exact hit/miss branching against the full method.
 *
 * The document is created with self::fromHTML() and annotated with `headers`,
 * `source_type` ('cache' or 'url'), `read_time` in milliseconds and, when
 * non-empty, `cch_meta`.
 *
 * @param string     $url     Address to load.
 * @param array|null $headers Extra request headers, merged over the defaults.
 * @param mixed      $body    Request body, passed through to http_wr().
 * @param array|null $options Options; `expires` is consumed here, the rest
 *                            are passed to http_wr().
 * @return mixed NOTE(review): presumably the annotated $doc — confirm.
 */
88 public static function fromURL($url, $headers=NULL, $body=NULL, $options=NULL) {
94 'expires' => self::$cache_expires,
96 $hd = array(
'Accept-Charset' =>
'UTF-8,*');
// Array union keeps the left operand's keys: caller values override defaults.
98 if($options) $opt = $options + $opt;
99 if($headers) $hd = $headers + $hd;
// `expires` is a cache setting only; it is removed before $opt is hashed
// into the cache id and passed to http_wr().
101 $expires = $opt[
'expires'];
102 unset($opt[
'expires']);
104 if(0 < $expires and $dir = self::$cache_path) {
// Use the resolved path when the directory exists, otherwise create it.
// NOTE(review): mode 0766 omits the search (x) bit for group and others,
// while set_cache() creates directories with 0777 — confirm which is intended.
106 $t = realpath($dir) and $dir = $t or mkdir($dir, 0766,
true);
107 $dir .= DIRECTORY_SEPARATOR;
// Cache id: raw SHA-1 of the URL whose trailing bytes are XOR-ed with the raw
// MD5 of the serialized options, hex-encoded. Different option sets for the
// same URL therefore map to different cache files.
108 $cch_id = hash(
'sha1', $url,
true);
109 $t = hash(
'md5', self::jsonize($opt),
true);
110 $cch_id = bin2hex(substr($cch_id, 0, -strlen($t)) . (substr($cch_id, -strlen($t)) ^ $t));
111 $cch_fn = $dir . $cch_id;
// Only a short, purely alphanumeric suffix of the URL is accepted as extension.
112 $ext = strtolower(strrchr($url,
'.'));
113 if(strlen($ext) < 7 && preg_match(
'/^\\.[a-z0-9]+$/', $ext)) {
117 $read_time = microtime(
true);
118 $ret = self::get_cache($cch_fn, $expires,
false);
119 $read_time = microtime(
true) - $read_time;
121 $source_type =
'cache';
// get_cache() returns array(content, meta); index 1 is the stored metadata.
123 $hdrs = $ret[1][
'hdr'];
124 $code = $ret[1][
'code'];
125 $url = $ret[1][
'url'];
127 self::$last_http_result = (object)array(
141 $source_type =
'url';
142 $read_time = microtime(
true);
144 $ret = self::http_wr($url, $hd, $body, $opt);
145 $read_time = microtime(
true) - $read_time;
148 $hdrs = $ret->headers;
// Prefer the URL reported by http_wr() when it provides one.
151 if($ret->url) $url = $ret->url;
153 if(!empty($cch_fn)) {
154 $save = self::set_cache($cch_fn, $html, array(
'hdr' => $hdrs,
'code' => $code,
'url' => $url));
161 $doc = self::fromHTML($html, $url);
163 $doc->headers = $hdrs;
164 $doc->source_type = $source_type;
165 isset($read_time) and $doc->read_time = $read_time * 1000;
166 if(!empty($cch_meta)) $doc->cch_meta = $cch_meta;
/**
 * Find elements matching a selector.
 *
 * Every argument after $sel is classified by its type rather than by its
 * position: an hQuery\Node becomes the search context, an array is merged
 * into the attribute filter, and a string is parsed with
 * self::html_parseAttrStr() and merged into the attribute filter.
 *
 * The selector is converted with self::html_selector2struc(); each part is
 * resolved through _find() or _children()/_filter(), then narrowed by the
 * positional and traversal steps listed under its 'p' key.
 *
 * @param string $sel   Selector string.
 * @param mixed  $_attr Attribute filter (array or attribute string) or a context node.
 * @param mixed  $ctx   Context node, attribute array or attribute string.
 * @return mixed The visible return wraps the accumulated matches in a new
 *               hQuery\Element. NOTE(review): confirm whether other return
 *               paths exist in the full method.
 * @throws \Exception 'Wrong context in <method>' when the context check is
 *                    reached with a value that is not an hQuery\Node.
 */
182 public function find($sel, $_attr=NULL, $ctx=NULL) {
// Inspect the actual argument list so attributes and context can be given in any order.
184 $c = func_num_args();
185 for($i=1;$i<$c;$i++) {
186 $a = func_get_arg($i);
188 if($a instanceof
hQuery\Node) $ctx = $a;
189 else throw new \Exception(
'Wrong context in ' . __METHOD__);
191 elseif(is_array($a)) $attr = array_merge($attr, $a);
192 elseif(is_string($a)) $attr = array_merge($attr, self::html_parseAttrStr($a));
194 if(isset($ctx)) $ctx = $this->_get_ctx($ctx);
196 $sel = self::html_selector2struc($sel);
200 foreach($sel as $a) {
207 $cx = $this->_get_ctx($rb);
// The 'i' entry of a selector part is matched as the `id` attribute.
213 if(isset($c[
'i'])) $at[
'id'] = $c[
'i'];
216 $rc = $this->_find($c[
'n'], $c[
'c'], $at, $cx);
220 $ch = $this->_children($rc);
221 $rc = $this->_filter($ch, $c[
'n'], $c[
'c'], $at);
226 foreach($c[
'p'] as $p) {
// A negative position counts from the end of the current result set.
228 if($p < 0) $p += count($rc);
229 if(count($rc) >= 1 || $p) {
230 $rc = $p < 0 ? NULL : array_slice($rc, $p, 1,
true);
233 elseif(is_array($p)) {
// '<' keeps the first $ch results, '>' drops the first $ch results,
// '-' and '+' map to _prev()/_next(), '|' climbs to parents and '*' descends
// to children. The do/while forms always run at least once.
236 case '<': $rc = array_slice($rc, 0, $ch,
true);
break;
237 case '>': $rc = array_slice($rc, $ch, count($rc),
true);
break;
238 case '-': $rc = $this->_prev($rc, $ch);
break;
239 case '+': $rc = $this->_next($rc, $ch);
break;
240 case '|':
do $rc = $this->_parent($rc);
while($ch-- > 0);
break;
241 case '*':
do $rc = $this->_children($rc);
while($ch-- > 0);
break;
// Merge this part's matches into the accumulated result. Keys are preserved,
// so an entry found twice is stored once. Note the loop reuses $rc as its
// value variable.
251 if($rc)
if(!$ra) $ra = $rc;
else {
foreach($rc as $rb => $rc) $ra[$rb] = $rc; }
255 return new hQuery\Element($this, $ra);
/**
 * Find elements matching a selector and return the HTML of each match.
 *
 * @param string $sel  Selector, passed through to find().
 * @param mixed  $attr Optional attribute filter, passed through to find().
 * @param mixed  $ctx  Optional search context, passed through to find().
 * @return array Result key => $element->html() for every match; empty when
 *               nothing matched.
 */
public function find_html($sel, $attr=NULL, $ctx=NULL) {
    // Forward the caller's $attr and $ctx as given. Writing `$attr=NULL, $ctx=NULL`
    // in the argument list is an assignment expression: it overwrote both
    // parameters and always passed NULL, silently dropping the caller's filters.
    $r = $this->find($sel, $attr, $ctx);
    $ret = array();
    if($r) {
        foreach($r as $k => $v) {
            $ret[$k] = $v->html();
        }
    }
    return $ret;
}
/**
 * Find elements matching a selector and return the text of each match.
 *
 * @param string $sel  Selector, passed through to find().
 * @param mixed  $attr Optional attribute filter, passed through to find().
 * @param mixed  $ctx  Optional search context, passed through to find().
 * @return array Result key => $element->text() for every match; empty when
 *               nothing matched.
 */
public function find_text($sel, $attr=NULL, $ctx=NULL) {
    // Forward the caller's $attr and $ctx as given. Writing `$attr=NULL, $ctx=NULL`
    // in the argument list is an assignment expression: it overwrote both
    // parameters and always passed NULL, silently dropping the caller's filters.
    $r = $this->find($sel, $attr, $ctx);
    $ret = array();
    if($r) {
        foreach($r as $k => $v) {
            $ret[$k] = $v->text();
        }
    }
    return $ret;
}
/**
 * Delegate to $this->_index_all() and hand its result back unchanged.
 *
 * @return mixed Whatever _index_all() returns.
 */
public function index() {
    $indexed = $this->_index_all();
    return $indexed;
}
/**
 * Serialize a value to a string, preferring JSON and falling back to PHP
 * serialize() when json_encode() fails.
 *
 * @param mixed  $data Value to encode.
 * @param string $type By-reference output. NOTE(review): presumably reports
 *                     which encoding was used — confirm against the full body.
 * @param int    $ops  json_encode() flags; JSON_UNESCAPED_UNICODE is added
 *                     when that constant is defined.
 * @return mixed NOTE(review): presumably the encoded string $str — confirm.
 */
308 public static function jsonize($data, &$type = NULL, $ops = 0) {
// The constant is checked first so the code still runs where it is undefined.
309 if(defined(
'JSON_UNESCAPED_UNICODE')) {
310 $ops |= JSON_UNESCAPED_UNICODE;
312 $str = $ops ? json_encode($data, $ops) : json_encode($data);
// json_encode() returns false on failure; fall back to PHP serialization.
313 if( $str ===
false ) {
314 $str = serialize($data);
334 $type = self::serjstype($str);
336 static $_json_support;
337 if ( !isset($_json_support) ) {
340 if ( function_exists(
'json_last_error') ) {
343 if ( function_exists(
'json_last_error_msg') ) {
350 $data = @unserialize($str);
351 if ( $data ===
false ) {
352 if ( strpos($str,
"\n") !==
false ) {
353 if ( $retry = strpos($str,
"\r") ===
false ) {
354 $str = str_replace(
"\n",
"\r\n", $str);
356 elseif ( $retry = strpos($str,
"\r\n") !==
false ) {
357 $str = str_replace(
"\r\n",
"\n", $str);
359 $retry and $data = unserialize($str);
365 $data = json_decode($str,
true);
367 if ( is_null($data) ) {
369 if( $_json_support == 0 ? $str !==
'null' : json_last_error() != JSON_ERROR_NONE ) {
370 $t = preg_replace(
'/,\s*([\]\}])/m',
'$1', $str) and
371 $data = json_decode($t,
true);
373 if( is_null($data) ) {
375 if ( $_json_support ) {
376 if ( json_last_error() != JSON_ERROR_NONE ) {
378 if ( $_json_support > 1 ) {
379 error_log(
'json_decode: ' . json_last_error_msg());
381 elseif( $_json_support > 0 ) {
382 error_log(
"json_decode error with code #".json_last_error());
389 if ( $str !==
'null' ) {
390 error_log(
"json_decode error");
398 $data = json_decode($str,
true);
399 if( is_null($data) && ($_json_support == 0 ? $str !==
'null' : json_last_error() != JSON_ERROR_NONE) ) {
400 $data = unserialize($str);
415 $c = substr($str, 0, 1);
416 if($str ===
'N;' || strpos(
'sibadO', $c) !==
false && substr($str, 1, 1) ===
':') {
420 $l = substr($str, -1);
421 if($c ==
'{' && $l ==
'}' || $c ==
'[' && $l ==
']') {
436 function_exists(
'zlib_decode') and $_gzdecode =
'zlib_decode' or
437 function_exists(
'gzdecode') and $_gzdecode =
'gzdecode' or
447 if ( !isset($_gzdecode) ) {
448 $_gzdecode = self::gz_supported();
451 return $_gzdecode ? $_gzdecode($str) : self::_gzdecode($str);
/**
 * Pure-PHP gzip decoder built on gzinflate().
 *
 * Unpacks the 10-byte gzip header and the 8-byte trailer (CRC32 and
 * uncompressed size), skips optional header fields, inflates the payload and
 * then checks the checksum and the size. Format and integrity problems are
 * reported with trigger_error(E_USER_WARNING), not with exceptions.
 *
 * @param string   $gzdata Raw gzip stream.
 * @param int|null $maxlen Passed to gzinflate() as its length limit in one of
 *                         the two inflate calls. NOTE(review): the condition
 *                         selecting between them should be confirmed.
 * @return mixed NOTE(review): presumably the inflated data — confirm.
 */
458 protected static function _gzdecode($gzdata, $maxlen=NULL) {
460 $len = strlen($gzdata);
// Fixed header: id (2 bytes, big-endian), method, flags, mtime (4 bytes, LE), xfl, os.
464 $head = substr($gzdata, 0, 10);
465 $head = unpack(
"n1id/C1cm/C1flg/V1mtime/C1xfl/C1os", $head);
466 list($ID, $CM, $FLG, $MTIME, $XFL, $OS) = array_values($head);
// Trailer: the last 8 bytes hold the CRC32 and the uncompressed size, little-endian.
472 $head = unpack(
"V1crc/V1isize", substr($gzdata, $len-8, 8));
473 list($CRC32, $ISIZE) = array_values($head);
475 #-- check gzip stream identifier 477 trigger_error(
"gzdecode: not in gzip format", E_USER_WARNING);
480 #-- check for deflate algorithm 482 trigger_error(
"gzdecode: cannot decode anything but deflated streams", E_USER_WARNING);
485 #-- start of data, skip bonus fields 487 if ($FLG & $FEXTRA) {
// Skip a NUL-terminated header field by moving past its terminator.
491 $s = strpos($gzdata,
"\000", $s) + 1;
493 if ($FLG & $FCOMMENT) {
494 $s = strpos($gzdata,
"\000", $s) + 1;
500 #-- get data, uncompress 501 $gzdata = substr($gzdata, $s, $len-$s);
503 $gzdata = gzinflate($gzdata, $maxlen);
507 $gzdata = gzinflate($gzdata);
// Verify the inflated data against the trailer values.
511 $chk = crc32($gzdata);
512 if ($CRC32 != $chk) {
513 trigger_error(
"gzdecode: checksum failed (real$chk != comp$CRC32)", E_USER_WARNING);
515 elseif ($ISIZE != strlen($gzdata)) {
516 trigger_error(
"gzdecode: stream size mismatch", E_USER_WARNING);
/**
 * Read a cache file written by set_cache().
 *
 * The file is used only when it exists and, if $expire is truthy, when its
 * modification time plus $expire is still in the future. Content is read
 * with flock_get_contents() and gzip-decoded when it starts with the gzip
 * magic bytes. A metadata section, whose length is read from the start of
 * the content, is decoded with unjsonize(); the remaining content is also
 * passed through unjsonize() when it is introduced by a "\n" marker.
 *
 * @param string   $fn        Cache file path.
 * @param int|bool $expire    Lifetime in seconds; falsy disables the age check.
 * @param bool     $meta_only When true the content part is returned as ''.
 * @return array|bool array($content, $meta) when either part is truthy,
 *                    otherwise false.
 *                    NOTE(review): a falsy content such as '' or '0' with
 *                    empty metadata is therefore reported as a miss.
 */
532 protected static function get_cache($fn, $expire=
false, $meta_only=
false) {
// A missing file makes filemtime() fail; the warning is suppressed and the
// whole condition is false.
534 if( $fm = @filemtime($fn) and (!$expire || $fm + $expire > time()) ) {
535 $cnt = self::flock_get_contents($fn);
// "\x1F\x8B" is the gzip magic number; strncmp() returns 0 on a match.
539 if($gz = !strncmp($cnt,
"\x1F\x8B", 2)) {
540 $cnt = self::gzdecode($cnt);
// Metadata length: up to 16 characters following the first character, cast to int.
543 $n = (int)substr($cnt, 1, 0x10);
546 $meta = substr($cnt, $l, $n);
547 if($meta !==
'') $meta = self::unjsonize($meta);
549 if($meta_only) $cnt =
'';
// A "\n" at the content offset marks content that was stored through jsonize().
552 if($cnt[$l] ==
"\n") {
553 $cnt = substr($cnt, ++$l);
554 if($cnt !==
'') $cnt = self::unjsonize($cnt);
557 $cnt = substr($cnt, $l);
562 if($meta_only) $cnt =
'';
565 return $cnt || $meta ? array($cnt, $meta) :
false;
/**
 * Write (or delete) a cache file.
 *
 * Passing `false` as $cnt deletes the file. Otherwise the metadata is
 * encoded with jsonize() and prefixed with a '#<n>' header line; content
 * that is not a string, or that begins with "\n", is stored as "\n" followed
 * by its jsonize() form. The parent directory is created if needed, the
 * payload is gzip-encoded when it is longer than the threshold and gzip is
 * supported, and the result is written with flock_put_contents().
 *
 * @param string   $fn   Cache file path.
 * @param mixed    $cnt  Content to store, or false to delete the entry.
 * @param mixed    $meta Metadata stored ahead of the content.
 * @param bool|int $gzip An int sets the minimum length (in bytes) above which
 *                       the payload is gzip-encoded; otherwise 1024 is used.
 * @return mixed For deletion: true when the file does not exist or unlink()
 *               succeeded. Otherwise the result of flock_put_contents().
 */
578 protected static function set_cache($fn, $cnt, $meta=NULL, $gzip=
true) {
579 if($cnt ===
false)
return !file_exists($fn) || unlink($fn);
582 $meta = self::jsonize($meta);
585 $meta =
'#'.$n .
"\n" . $meta;
// The leading "\n" is the marker get_cache() uses to recognise jsonized content.
// NOTE(review): $cnt[0] on an empty string raises an "Uninitialized string
// offset" notice/warning — confirm that '' cannot reach this line.
586 if(!is_string($cnt) || $cnt[0] ==
"\n") { $cnt =
"\n" . self::jsonize($cnt); ++$n; }
587 if($n) $cnt = $meta . $cnt;
// Best effort: errors (e.g. the directory already exists) are suppressed.
589 @mkdir(dirname($fn), 0777,
true);
591 $gl = is_int($gzip) ? $gzip : 1024;
593 strlen($cnt) > $gl && self::gz_supported() and
594 $cnt = gzencode($cnt);
596 return self::flock_put_contents($fn, $cnt);
/**
 * Acquire a file lock, retrying with random pauses until a timeout expires.
 *
 * One flock() attempt is made first. If it fails and the operation is not an
 * unlock, flock() is retried after random sleeps of between roughly 1 ms and
 * 64 ms (both capped by the timeout) until it succeeds or $timeout_ms has
 * elapsed.
 *
 * NOTE(review): the first flock() call uses $lock as given, so the retry loop
 * is only reached when the caller includes LOCK_NB; without it that first
 * call blocks by itself. flock_put_contents() calls this with plain LOCK_EX.
 *
 * @param resource $fp         Open file handle.
 * @param int      $lock       flock() operation (LOCK_SH, LOCK_EX, LOCK_UN, optionally | LOCK_NB).
 * @param int      $timeout_ms Maximum time to keep retrying, in milliseconds.
 * @return mixed NOTE(review): presumably the final flock() result $l — confirm.
 */
611 static function do_flock($fp, $lock, $timeout_ms=384) {
612 $l = flock($fp, $lock);
// Retrying only makes sense when acquiring a lock, never when releasing one.
613 if( !$l && ($lock & LOCK_UN) != LOCK_UN ) {
614 $st = microtime(
true);
// Sleep bounds in microseconds (usleep units): 1000..64000, capped by the timeout.
615 $m = min( 1e3, $timeout_ms*1e3);
616 $n = min(64e3, $timeout_ms*1e3);
// Keep the range non-empty when both bounds were capped to the same value.
617 if($m == $n) $m = ($n >> 1) + 1;
// From here on the timeout is in seconds, comparable with microtime(true).
618 $timeout_ms = (float)$timeout_ms / 1000;
621 usleep($t = rand($m, $n));
622 $l = flock($fp, $lock);
623 }
while ( !$l && (microtime(
true)-$st) < $timeout_ms );
/**
 * Write content to a file while holding an exclusive lock.
 *
 * The file is opened in 'c+' mode (created if missing, not truncated on
 * open). $block doubles as a flag field: the FILE_APPEND bit is extracted
 * and cleared, and what remains selects a blocking lock through do_flock()
 * or a single non-blocking flock() attempt. Arrays and objects are encoded
 * with jsonize() before writing. After a successful write the file is
 * truncated at the current position so that no stale bytes remain.
 *
 * @param string   $fn    Target file path.
 * @param mixed    $cnt   Content; arrays/objects are jsonized first.
 * @param bool|int $block Truthy to wait for the lock; may include FILE_APPEND.
 * @return mixed NOTE(review): presumably the fwrite() result or false on
 *               failure — confirm.
 */
628 static function flock_put_contents($fn, $cnt, $block=
false) {
631 if( $f = fopen($fn,
'c+') ) {
// Split the FILE_APPEND bit out of $block; the remainder is the blocking flag.
632 $app = $block & FILE_APPEND and $block ^= $app;
633 if( $block ? self::do_flock($f, LOCK_EX) : flock($f, LOCK_EX | LOCK_NB) ) {
634 if(is_array($cnt) || is_object($cnt)) $cnt = self::jsonize($cnt);
635 if($app) fseek($f, 0, SEEK_END);
636 if(
false !== ($ret = fwrite($f, $cnt))) {
// 'c+' does not truncate on open, so cut the file at the end of the new data.
638 ftruncate($f, ftell($f));
/**
 * Read a whole file while holding a shared lock.
 *
 * The file is opened read-only and locked with LOCK_SH; unless $block is
 * truthy the lock attempt is non-blocking. The file is then read in chunks
 * until EOF or a read error.
 *
 * @param string $fn    File path.
 * @param bool   $block Wait for the lock when truthy.
 * @return mixed NOTE(review): presumably the accumulated content, or false
 *               when nothing was read and fread() failed — confirm.
 */
647 static function flock_get_contents($fn, $block=
false) {
650 if( $f = fopen($fn,
'r') ) {
651 if( flock($f, LOCK_SH | ($block ? 0 : LOCK_NB)) ) {
653 do $ret .= $r = fread($f, $s);
while($r !==
false && !feof($f));
// Report a read failure as false only when no data at all was collected.
654 if($ret == NULL && $r ===
false) $ret = $r;
/**
 * Parse a Set-Cookie header value into an associative array.
 *
 * An array argument is processed element by element (recursively), keeping
 * its keys. A string is split on ';': the first segment yields the cookie
 * value, and each following segment becomes a lower-cased attribute whose
 * value is the text after '=' or `true` for attributes without a value. A
 * textual `expires` attribute that strtotime() can parse is replaced by its
 * Unix timestamp.
 *
 * @param string|array $str Header value, or a list of header values.
 * @return mixed NOTE(review): presumably $ret. http_wr() reads the entries
 *               'key', 'value' and 'expires' from each parsed cookie — confirm.
 */
664 public static function parse_cookie($str) {
666 if ( is_array($str) ) {
667 foreach($str as $k => $v) {
668 $ret[$k] = self::parse_cookie($v);
673 $str = explode(
';', $str);
// Split name and value at the first '=' only, so the value may contain '='.
674 $t = explode(
'=', array_shift($str), 2);
// NOTE(review): $t[1] is undefined when the first segment contains no '='.
676 $ret[
'value'] = $t[1];
677 foreach ($str as $t) {
678 $t = explode(
'=', trim($t), 2);
679 if ( count($t) == 2 ) {
680 $ret[strtolower($t[0])] = $t[1];
683 $ret[strtolower($t[0])] =
true;
687 if ( !empty($ret[
'expires']) && is_string($ret[
'expires']) ) {
688 $t = strtotime($ret[
'expires']);
// strtotime() signals failure with false (and with -1 on very old PHP versions).
689 if ( $t !==
false and $t !== -1 ) {
690 $ret[
'expires'] = $t;
/**
 * Perform an HTTP request over a raw socket and collect the response.
 *
 * Steps, as far as the code below shows:
 *  - $host is split into host, port and path: through parse_url() when it
 *    contains '://' near its start, otherwise by exploding on '/' and ':'.
 *  - The port comes from $options['port'] or from the scheme
 *    (https 443, ftp 21, sftp 22, otherwise 80).
 *  - Array/object bodies are encoded with http_build_query() and sent as
 *    application/x-www-form-urlencoded; Content-Length is set from the body.
 *  - Request header names are lower-cased with spaces and underscores turned
 *    into dashes, then written out with each dash-separated word capitalised.
 *  - 'decode' => 'gzip' adds Accept-Encoding: gzip, and a gzip-encoded
 *    response is then decoded with self::gzdecode().
 *  - Connection is 'close' unless $options['close'] is set and falsy.
 *  - Response header names are upper-cased with '-' and ' ' replaced by '_';
 *    a header received more than once is stored as an array of values.
 *  - With $options['redirects'] > 0 and a Location header, the method calls
 *    itself for the resolved URL with the redirect counter decremented. When
 *    $options['use_cookies'] is set, unexpired Set-Cookie values are added
 *    to the Cookie request header.
 *  - The body is read according to Transfer-Encoding: chunked,
 *    Content-Length, or until EOF; none is read for HEAD requests or for
 *    status codes below 200, 204 and 304.
 *
 * The result object is also stored in self::$last_http_result.
 *
 * @param string            $host    URL, or host[:port][/path].
 * @param array|string|null $head    Request headers; a string is split on CRLF.
 * @param mixed             $body    Request body; arrays/objects are form-encoded.
 * @param array|null        $options Keys read below: host, port, scheme, method,
 *                                   protocol, timeout, decode, close, redirects,
 *                                   redirect_method, use_cookies.
 * @return mixed NOTE(review): presumably the stdClass $ret, which receives
 *               options, code, headers, method and request — confirm.
 * @throws \Exception On an invalid host, when the socket cannot be opened or
 *                    written to, and when gzip decoding of the response fails.
 */
720 public static function http_wr($host, $head = NULL, $body = NULL, $options = NULL) {
721 self::$last_http_result =
722 $ret = new \stdClass;
723 empty($options) and $options = array();
// A scheme separator within the first characters means $host is a full URL.
726 if($p = strpos($host,
'://') and $p < 7) {
728 $p = parse_url($host);
730 throw new \Exception(
'Wrong host specified');
734 if(isset($p[
'query'])) {
735 $path .=
'?' . $p[
'query'];
737 if(isset($p[
'port'])) {
740 unset($p[
'path'], $p[
'query']);
// No scheme: treat $host as host[:port][/path].
745 $p = explode(
'/', $host, 2); list($host, $path) = $p;
746 $p = explode(
':', $host, 2); list($host, $port) = $p;
// strncmp() is non-zero when $path does not start with '/'.
749 if(strncmp($path,
'/', 1)) {
755 if(isset($options[
'port'])) {
756 $port = $options[
'port'];
759 switch($options[
'scheme']) {
762 case 'https': $port = 443;
break;
763 case 'ftp' : $port = 21;
break;
764 case 'sftp' : $port = 22;
break;
766 default : $port = 80;
774 'host' => isset($options[
'host']) ? $options[
'host'] : $host,
// NOTE(review): the value below contains `q =0.9` with a space before `=`;
// HTTP quality values are written `q=0.9`, so this looks like a typo.
775 'accept' =>
'text/html,application/xhtml+xml,application/xml;q =0.9,*/*;q=0.8',
777 if(!empty($options[
'scheme'])) {
// NOTE(review): guarded by $options['scheme'] but switching on $p['scheme'] —
// confirm both are always set together.
778 switch($p[
'scheme']) {
783 $conhost =
'tls://' . $host;
786 $conhost = $options[
'scheme'] .
'://' . $host;
790 static $boundary =
"\r\n\r\n";
791 $blen = strlen($boundary);
793 if(is_array($body) || is_object($body)) {
794 $body = http_build_query($body);
795 $_h[
'content-type'] =
'application/x-www-form-urlencoded';
797 $body = (string)$body;
798 $_h[
'content-length'] = strlen($body);
800 empty($options[
'method']) and $options[
'method'] =
'POST';
// Fall back to GET when no method has been chosen by this point.
806 !empty($options[
'method']) and $meth = strtoupper($options[
'method']) or $meth =
'GET';
809 if(!is_array($head)) {
810 $head = explode(
"\r\n", $head);
812 foreach($head as $i => $v) {
814 $v = explode(
':', $v, 2);
815 if(count($v) != 2)
continue;
// Normalise header names: lower case, spaces and underscores become dashes.
818 $i = strtolower(strtr($i,
' _',
'--'));
823 if(@$options[
'decode'] ==
'gzip') {
825 $_h[
'accept-encoding'] =
'gzip';
832 if(!isset($options[
'close']) || @$options[
'close']) {
833 $_h[
'connection'] =
'close';
836 $_h[
'connection'] =
'keep-alive';
839 $prot = empty($options[
'protocol']) ?
'HTTP/1.1' : $options[
'protocol'];
841 $head = array(
"$meth $path $prot");
// Emit each header with every dash-separated word capitalised.
842 foreach($_h as $i => $v) {
843 $i = explode(
'-', $i);
844 foreach($i as &$j) $j = ucfirst($j);
845 $i = implode(
'-', $i);
846 $head[] = $i .
': ' . $v;
848 $rqst = implode(
"\r\n", $head) . $boundary . $body;
851 $timeout = isset($options[
'timeout']) ? $options[
'timeout'] : @ini_get(
"default_socket_timeout");
853 $ret->options = $options;
860 $fs = @fsockopen($conhost, $port, $errno, $errstr, $timeout);
862 throw new \Exception(
'unable to create socket "'.$conhost.
':'.$port.
'" '.$errstr, $errno);
864 if(!fwrite($fs, $rqst)) {
865 throw new \Exception(
"unable to write");
// Read the response head line by line; a bare CRLF ends it.
870 while($open = !feof($fs) && ($p = @fgets($fs, 1024))) {
871 if($p ==
"\r\n")
break;
876 $h = explode(
"\r\n", rtrim($rsps));
877 list($rprot, $rcode, $rmsg) = explode(
' ', array_shift($h), 3);
879 $v = explode(
':', $v, 2);
880 $k = strtoupper(strtr($v[0],
'- ',
'__'));
881 $v = isset($v[1]) ? trim($v[1]) : NULL;
// A header seen more than once is collected into an array of values.
884 if ( isset($_rh[$k]) ) {
886 if ( is_array($_rh[$k]) ) {
890 $_rh[$k] = array($_rh[$k], $v);
899 $_preserve_method =
true;
904 $_preserve_method =
false;
908 if( @$options[
'redirects'] > 0 && $loc = @$_rh[
'LOCATION'] ) {
909 if ( !empty($options[
'host']) ) {
910 $host = $options[
'host'];
// With several Location headers, the last one is used.
912 is_array($loc) and $loc = end($loc);
913 $loc = self::abs_url($loc, compact(
'host',
'port',
'path') + array(
'scheme' => empty($options[
'scheme'])?
'':$options[
'scheme']));
// Connection details are dropped here; the recursive call derives them from $loc.
914 unset($_h[
'host'], $options[
'host'], $options[
'port'], $options[
'scheme']);
915 if ( isset($options[
'redirect_method']) ) {
916 $redirect_method = $options[
'redirect_method'];
917 if ( is_string($redirect_method) ) {
918 $options[
'method'] = $redirect_method = strtoupper($redirect_method);
919 $_preserve_method =
true;
920 if ( $redirect_method !=
'POST' && $redirect_method !=
'PUT' && $redirect_method !=
'DELETE' ) {
925 $_preserve_method = (bool)$redirect_method;
928 if ( !$_preserve_method ) {
930 unset($options[
'method']);
932 --$options[
'redirects'];
934 if ( !empty($_rh[
'SET_COOKIE']) && !empty($options[
'use_cookies']) ) {
935 $t = self::parse_cookie((array)$_rh[
'SET_COOKIE']);
// Only cookies without an expiry, or with one that is not yet past, are forwarded.
940 if ( empty($c[
'expires']) || $c[
'expires'] >= $now ) {
941 $_h[
'cookie'] = (empty($_h[
'cookie']) ?
'' : $_h[
'cookie'] .
'; ') .
942 $c[
'key'] .
'=' . $c[
'value'];
947 return self::http_wr($loc, $_h, $body, $options);
// No body is read for a closed stream, these status codes, or HEAD requests.
953 if(@!$open || $rcode < 200 || $rcode == 204 || $rcode == 304 || $meth ==
'HEAD') {
956 elseif(isset($_rh[
'TRANSFER_ENCODING']) && strtolower($_rh[
'TRANSFER_ENCODING']) ===
'chunked') {
959 elseif(isset($_rh[
'CONTENT_LENGTH'])) {
960 $bl = (int)$_rh[
'CONTENT_LENGTH'];
971 while($bl > 0 and $open &= !feof($fs) && ($p = @fread($fs, $bl))) {
977 while($open &= !feof($fs) && ($p = @fgets($fs, 1024))) {
978 $_re = explode(
';', rtrim($p));
982 while($bl > 0 and $open &= !feof($fs) && ($p = @fread($fs, $bl))) {
988 if($open &= !feof($fs) && ($p = @fgets($fs, 1024))) {
991 $v = explode(
':', $p, 2);
992 $k = strtoupper(strtr($v[0],
'- ',
'__'));
993 $v = isset($v[1]) ? trim($v[1]) : NULL;
996 if ( isset($_rh[$k]) ) {
998 if ( is_array($_rh[$k]) ) {
1002 $_rh[$k] = array($_rh[$k], $v);
1015 while($open &= !feof($fs) && ($p = @fread($fs, 1024))) {
1022 isset($options[
'decode']) && $options[
'decode'] ==
'gzip' &&
1023 isset($_rh[
'CONTENT_ENCODING']) && $_rh[
'CONTENT_ENCODING'] ==
'gzip' 1025 $r = self::gzdecode($rsps);
1027 unset($_rh[
'CONTENT_ENCODING']);
1031 throw new \Exception(
"Can't gzdecode(response), try ['decode' => false] option");
1034 $ret->code = $rcode;
1036 $ret->headers = isset($_rh) ? $_rh : NULL;
1038 $ret->method = $meth;
1042 $ret->request = $rqst;
static fromFile($filename, $use_include_path=false, $context=NULL)
static get_cache($fn, $expire=false, $meta_only=false)
static http_wr($host, $head=NULL, $body=NULL, $options=NULL)
find_text($sel, $attr=NULL, $ctx=NULL)
static set_cache($fn, $cnt, $meta=NULL, $gzip=true)
static fromHTML($html, $url=NULL)
find($sel, $_attr=NULL, $ctx=NULL)
static unjsonize($str, &$type=NULL)
static fromURL($url, $headers=NULL, $body=NULL, $options=NULL)
static do_flock($fp, $lock, $timeout_ms=384)
static _gzdecode($gzdata, $maxlen=NULL)
static jsonize($data, &$type=NULL, $ops=0)
find_html($sel, $attr=NULL, $ctx=NULL)