4 class_exists(
'duzun\\hQuery\\Node',
false) or require_once __DIR__ . DIRECTORY_SEPARATOR . 'Node.php';
/**
 * When true, the constructor rewrites the input HTML with
 * preg_replace('#(>)?\s+(<)?#', '$1 $2', ...), i.e. every run of whitespace
 * is collapsed to a single space (an adjacent '>' or '<' is kept).
 */
13 public static $del_spaces =
false;
/**
 * NOTE(review): not read anywhere in the visible code; presumably toggles
 * case-insensitive handling of tag names - confirm against the parser.
 */
14 public static $case_folding =
true;
/**
 * NOTE(review): not read anywhere in the visible code; presumably controls
 * automatic closing of unclosed tags - confirm against the parser.
 */
15 public static $autoclose_tags =
false;
/**
 * Names of HTML elements that take no closing tag.
 * NOTE(review): no use of this list is visible here; presumably the indexer
 * treats these tags as self-closing - confirm.
 */
17 public static $_emptyTags = array(
'base',
'meta',
'link',
'hr',
'br',
'basefont',
'param',
'img',
'area',
'input',
'isindex',
'col');
/**
 * Map of opening marker => closing marker ('--' for comments, '[CDATA[' => ']]').
 * NOTE(review): no use is visible here; presumably the markers that follow
 * '<!' and precede '>' - confirm. This is a different array from the local
 * $specialTags ('!' and '?') built inside _index_all().
 */
18 public static $_specialTags = array(
'--'=>
'--',
'[CDATA['=>
']]');
/**
 * Tag names that _index_all() flips into a lookup table and checks for every
 * opened tag that is not self-closed.
 * NOTE(review): presumably the content of these tags is skipped rather than
 * parsed as markup - confirm.
 */
19 public static $_unparsedTags = array(
'style',
'script');
/**
 * Attribute names whose values _index_attribs() stores per element id in
 * $this->idx_attr; get_ids_byAttr() and get_attr_byId() read that index for
 * these attributes.
 */
20 public static $_index_attribs = array(
'href',
'src');
/**
 * Attributes that hold URLs (name => name, so isset() can be used as a lookup).
 * get_attr_byId() passes their values through url2abs().
 */
21 public static $_url_attribs = array(
'href'=>
'href',
'src'=>
'src');
/**
 * Character-range spec, expanded with str_range() in _index_all(); a character
 * found in the expanded set is accepted as the first letter of a tag name.
 */
23 protected static $_tagID_first_letter =
'a-zA-Z_';
/**
 * Character-range spec, expanded with str_range() in _index_all() and used as
 * the strspn() mask that measures the length of a tag name.
 */
24 protected static $_tagID_letters =
'a-zA-Z_0-9:\-';
/**
 * Internal charset: the constructor converts the input HTML to this charset
 * when the detected charset differs from it.
 */
25 protected static $_icharset =
'UTF-8';
/**
 * Index keyed by CSS class name, built and assigned by _index_classes().
 * Each entry is consumed either as a single attribute-set id or as an array
 * keyed by attribute-set id (see the is_array() checks where it is read).
 */
36 protected $class_idx ;
/**
 * Guard flag: set to true by _index_all() so a second call returns the
 * existing tag index instead of indexing again.
 */
40 protected $indexed =
false;
44 public function __get($name) {
45 if($this->_prop && array_key_exists($name, $this->_prop))
return $this->_prop[$name];
48 return $this->strlen();
54 return @$this->_prop[
'baseURL'];
57 return $this->location();
61 $this->_prop[
'charset'] =
62 $c = self::detect_charset($this->
html);
69 public function __set($name, $value) {
71 case 'hostURL':
return false;
79 return $this->location($value);
82 if(isset($value))
return $this->_prop[$name] = $value;
83 $this->__unset($name);
87 public function location($href=NULL) {
88 if(func_num_args() < 1) {
89 return @$this->_prop[
'location'][
'href'];
92 if(!isset($this->_prop[
'baseURI'])) {
95 $this->_prop[
'location'][
'href'] = $href;
102 if(func_num_args() < 1) {
103 $href = @$this->_prop[
'baseURI'];
107 $t = self::get_url_base($href,
true);
108 if(!$t)
return false;
114 $this->_prop[
'hostURL'] = $bh;
115 $this->_prop[
'baseURL'] = $bu;
116 $this->_prop[
'baseURI'] = $href;
122 public function __construct($html, $idx=
true) {
123 if(!is_string($html)) $html = (string)$html;
124 $c = self::detect_charset($html) or $c = NULL;
126 $ic = self::$_icharset;
127 if($c != $ic) $html = self::convert_encoding($html, $ic, $c);
129 $this->_prop[
'charset'] = $c;
130 if(self::$del_spaces) {
131 $html = preg_replace(
'#(>)?\\s+(<)?#',
'$1 $2', $html);
133 $this->tags = self::$_ar_;
135 parent::__construct($this, self::$_ar_);
139 $this->_prop[
'baseURI'] =
140 $this->_prop[
'baseURL'] =
141 $this->_prop[
'hostURL'] = NULL;
143 if($this->
html && $idx) $this->_index_all();
/**
 * Returns the HTML source held by this object.
 *
 * __toString() is required to return a string. Other accessors in this class
 * (strlen(), substr()) guard with isset($this->html), so the property may be
 * unset or null; in that case an empty string is returned instead of letting
 * PHP raise a fatal "must return a string" error during string conversion.
 *
 * @return string The HTML source, or '' when none is available.
 */
public function __toString() {
    return isset($this->html) ? (string)$this->html : '';
}
149 public static function get_url_base($url, $array=
false) {
150 if($ub = self::get_url_path($url)) {
152 $q = strpos($up,
'/', strpos($up,
'//')+2);
153 $ub = substr($up, 0, $q+1);
155 return $array && $ub ? array($ub, $up) : $ub;
158 public static function get_url_path($url) {
159 $p = strpos($url,
'//');
160 if($p ===
false || $p && !preg_match(
'|^[a-z]+\:$|', substr($url, 0, $p)))
return false;
161 $q = strrpos($url,
'/');
163 $url = substr($url, 0, $q+1);
171 public function url2abs($url) {
172 if( isset($this->_prop[
'baseURL']) ) {
173 return self::abs_url($url, $this->_prop[
'baseURL']);
198 return preg_match(
'/^[a-zA-Z]+\:\/\//', $path);
209 $ds = array(
'\\'=>1,
'/'=>2);
210 if( isset($ds[substr($path, 0, 1)]) ||
211 substr($path, 1, 1) ==
':' && isset($ds[substr($path, 2, 1)])
215 if(($l=strpos($path,
'://')) && $l < 32)
return $l;
229 if (!self::is_url_path($url)) {
230 $t = is_array($base) ? $base : parse_url($base);
231 if (strncmp($url,
'//', 2) == 0) {
232 if ( !empty($t[
'scheme']) ) {
233 $url = $t[
'scheme'] .
':' . $url;
237 $base = (empty($t[
'scheme']) ?
'//' : $t[
'scheme'] .
'://') .
238 $t[
'host'] . (empty($t[
'port']) ?
'' :
':' . $t[
'port']);
239 if (!empty($t[
'path'])) {
240 $s = dirname($t[
'path'] .
'f');
241 if (DIRECTORY_SEPARATOR !=
'/') {
242 $s = strtr($s, DIRECTORY_SEPARATOR,
'/');
244 if ($s && $s !==
'.' && $s !==
'/' && substr($url, 0, 1) !==
'/') {
245 $base .=
'/' . ltrim($s,
'/');
248 $url = rtrim($base,
'/') .
'/' . ltrim($url,
'/');
252 $p = strpos($url,
':');
253 if (substr($url, $p + 3, 1) ===
'/' && in_array(substr($url, 0, $p), array(
'http',
'https'))) {
254 $url = substr($url, 0, $p + 3) . ltrim(substr($url, $p + 3),
'/');
263 public static function detect_charset($str) {
265 $str = substr($str, 0, $l);
266 $str_ = strtolower($str);
269 $p = strpos($str_,
'<meta', $p);
270 if($p ===
false)
break;
272 $q = strpos($str_,
'>', $p);
273 if($q < $p) $q = strlen($str_);
274 $a = substr($str, $p, $q-$p);
276 $a = self::html_parseAttrStr($a,
true);
277 if(!empty($a[
'charset'])) {
278 return strtoupper($a[
'charset']);
280 if(isset($a[
'http-equiv']) && strtolower($a[
'http-equiv']) ===
'content-type') {
281 if(empty($a[
'content']))
return false;
282 $a = explode(
'charset=', $a[
'content']);
283 return empty($a) || empty($a[1]) ? false : strtoupper(trim($a[1]));
289 public function strlen() {
290 return isset($this->
html) ? strlen($this->
html) : 0;
293 public function substr($start, $length=NULL) {
294 return isset($this->
html)
295 ? substr($this->
html, $start, isset($length) ? $length : strlen($this->
html))
304 foreach($this->attribs as $i => $a) $ar[$i] = self::html_attr2str($a);
305 $inf[
'attribs'] = $ar ;
306 $inf[
'attrs'] = $this->attrs ;
307 $inf[
'idx_attr'] = $this->idx_attr ;
308 $inf[
'tag_idx'] = $this->tag_idx ;
309 $inf[
'attr_idx'] = $this->attr_idx ;
310 $inf[
'class_idx'] = $this->class_idx ;
315 $pb = -1; $pe = PHP_INT_MAX;
317 foreach($this->ids as $b => $e) {
318 if($pb < $b && $b < $pe) {
319 $st[] = array($pb, $pe);
320 list($pb, $pe) = array($b, $e);
322 else while($pe < $b && $st) {
323 list($pb, $pe) = array_pop($st);
325 $nm[$b] = $this->tags[$b];
326 $lev[$b] = count($st);
328 foreach($nm as $b => &$n) {
329 $n = str_repeat(
' -', $lev[$b]) .
' < ' . $n .
' ' . $this->get_attr_byId($b, NULL,
true) .
' >';
331 $nm = implode(
"\n", $nm);
333 unset($lev, $st, $nm);
339 if(!isset($o->l)) $o->l = strlen($o->h);
340 $o->tg = self::$_ar_;
343 $i = strpos($o->h,
'<!--', $i);
344 if($i ===
false)
break;
347 $i = strpos($o->h,
'-->', $i);
348 if($i ===
false) $i = $o->l;
356 private function _index_tags() {
357 $s = $nix = $ix = self::$_ar_;
359 foreach($this->tags as $id => $n) {
361 $ix[$n][$id] = $ids[$id];
363 foreach($ix as $n => $v) {
364 foreach($v as $id => $e) $this->tags[$id] = $n;
365 if(isset($nix[$n]))
continue;
366 $_n = strtolower($n);
367 if(isset($nix[$_n])) {
368 foreach($v as $id => $e) $nix[$_n][$id] = $e;
375 foreach($s as $_n) asort($nix[$_n]);
376 return $this->tag_idx = $nix;
380 private function _index_attribs($attrs) {
381 $this->attr_idx = $this->attrs = $aix = $six = $iix = $iax = self::$_ar_;
383 $ian = self::$_index_attribs;
384 foreach($ian as $atn)
if(!isset($iax[$atn])) $iax[$atn] = self::$_ar_;
385 foreach($attrs as $str => $v) {
386 $a = self::html_parseAttrStr($str,
true,
false);
388 foreach($ian as $atn) {
389 if(isset($a[$atn])) {
391 foreach($v as $e) $iax[$atn][$e] = $a[$atn];
394 $iax[$atn][$v] = $a[$atn];
399 if(empty($a))
continue;
400 $str = self::html_attr2str($a);
402 if(isset($six[$str])) {
404 if(!is_array($iix[$aid])) $iix[$aid] = array($iix[$aid]);
405 if(is_array($v))
foreach($v as $v_) $iix[$aid][] = $v_;
406 else $iix[$aid][] = $v;
415 unset($six, $attrs, $i);
416 foreach($aix as $aid => $a) {
425 $u[$e] = $this->ids[$e];
426 $this->attrs[$e] = $aid;
431 if(!is_array($v)) $this->attrs[$v] = $aid;
432 $this->attr_idx[$aid] = $v;
434 foreach($iax as $atn => $v)
if(!$v) unset($iax[$atn]);
435 $this->idx_attr = $iax;
436 $this->attribs = $aix;
438 return $this->attr_idx;
442 private function _index_classes() {
444 $aix = $this->attr_idx;
445 foreach($this->attribs as $aid => &$a)
if(!empty($a[
'class'])) {
448 $cl = preg_split(
'|\\s+|',trim($cl));
450 foreach($cl as $cl) {
451 if(isset($ix[$cl])) {
452 if(!is_array($ix[$cl])) $ix[$cl] = array($ix[$cl]=>$this->attr_idx[$ix[$cl]]);
453 $ix[$cl][$aid] = $this->attr_idx[$aid];
460 return $this->class_idx = $ix;
463 protected function _index_all() {
464 if($this->indexed)
return $this->tag_idx;
465 $this->indexed =
true;
468 $this->o = $o = new \stdClass;
470 $o->l = strlen($o->h);
472 $o->tg = self::$_ar_;
475 $firstLetterChars = self::str_range(self::$_tagID_first_letter);
476 $tagLettersChars = self::str_range(self::$_tagID_letters);
477 $specialTags = array(
'!'=>1,
'?'=>2);
478 $unparsedTags = array_flip(self::$_unparsedTags);
482 $stack = $a = self::$_ar_;
485 $i = strpos($o->h,
'<', $i);
486 if($i ===
false)
break;
492 if($isCloseTag = $c ===
'/') {
498 if(
false !== strpos($firstLetterChars, $c) ) {
500 $j = strspn($o->h, $tagLettersChars, $i);
501 $n = substr($o->h, $i-1, $j+1);
505 if($utn !== $n || !$isCloseTag) {
510 $i = self::html_findTagClose($o->h, $i);
511 if($i ===
false)
break;
516 $this->tags[$e] = $n;
518 $b += strspn($o->h,
" \n\r\t", $b);
520 $at = trim(substr($o->h, $b, $e-$b));
522 if(!isset($a[$at])) $a[$at] = $e;
523 elseif(!is_array($a[$at])) $a[$at] = array($a[$at], $e);
527 if($o->h[$e-1] !=
'/') {
529 if(isset($unparsedTags[$n])) {
544 $this->ids[$q] = $b-1;
548 elseif(!$isCloseTag) {
550 if(isset($specialTags[$c])) {
552 if(isset($o->tg[$b])) {
559 $i = strpos($o->h,
'>', $i);
560 if($i ===
false)
break;
565 foreach($stack as $n => $st)
if(empty($st)) unset($stack[$n]);
574 $this->_index_tags();
575 $this->_index_attribs($a); unset($a);
576 $this->_index_classes();
580 $this->o = self::$_nl_;
583 if(!empty($this->tag_idx[
'base'])) {
584 foreach($this->tag_idx[
'base'] as $b => $e) {
585 if($a = $this->get_attr_byId($b,
'href',
false)) {
592 return $this->tag_idx;
596 protected function _get_ctx($ctx) {
597 if ( !($ctx instanceof parent) ) {
598 if(is_array($ctx) || is_int($ctx)) {
599 $ctx =
new Context($this, $ctx,
true);
605 return $ctx && count($ctx) ? $ctx : self::$_fl_;
608 protected function _find($name, $class=NULL, $attr=NULL, $ctx=NULL, $rec=
true) {
613 if($attr) $aids = $this->get_aids_byClassAttr($class, $attr,
true);
615 if(!$aids)
return self::$_nl_;
619 if(!$aids)
return self::$_nl_;
622 if(is_string($name) && $name !==
'' && $name !=
'*') {
623 $name = strtolower(trim($name));
624 if(empty($this->tag_idx[$name]))
return self::$_nl_;
629 $ctx = $this->_get_ctx($ctx);
630 if(!$ctx)
throw new \Exception(__CLASS__.
'->'.__FUNCTION__.
': Invalid context!');
635 if($ni && $ctx) $ni = $ctx->_filter_contains($ni);
636 if(!$ni)
return self::$_nl_;
637 if($name) $ni = array_intersect_key($ni, $this->tag_idx[$name]);
641 $ni = $this->tag_idx[$name];
642 if($ni && $ctx) $ni = $ctx->_filter_contains($ni);
645 if($ctx) $ni = $ctx->_sub_ids(
false);
646 else $ni = $this->ids;
650 return $ni ? $ni : self::$_nl_;
661 if(!is_array($cl)) $cl = preg_split(
'|\\s+|',trim($cl));
663 if ( is_string($id) && !is_numeric($id) ) {
664 $id = $this->
find($id);
666 if ( $id instanceof
Node ) {
669 if ( !empty($exc) ) {
670 $id = array_diff_key($id, $exc);
675 foreach($id as $id => $e) {
677 if($c) $ret[$id] = $e;
678 elseif($c ===
false)
return $c;
682 if(!isset($this->attrs[$id]))
return 0;
683 foreach($cl as $cl) {
684 if(!isset($this->class_idx[$cl]))
return self::$_fl_;
685 $cl = $this->class_idx[$cl];
686 $aid = $this->attrs[$id];
687 if( is_array($cl) ? !isset($cl[$aid]) : $cl != $aid )
return 0;
692 protected function _filter($ids, $name=NULL, $class=NULL, $attr=NULL, $ctx=NULL) {
695 if($attr) $aids = $this->get_aids_byClassAttr($class, $attr,
true);
697 if(!$aids)
return self::$_nl_;
701 if(!$aids)
return self::$_nl_;
703 unset($class, $attr);
705 foreach($ids as $b => $e)
if(!isset($this->attrs[$b], $aids[$this->attrs[$b]])) unset($ids[$b]);
706 if(!$ids)
return self::$_nl_;
710 if(is_string($name) && $name !==
'' && $name !=
'*') {
711 $name = strtolower(trim($name));
712 if(empty($this->tag_idx[$name]))
return self::$_nl_;
713 foreach($ids as $b => $e)
if(!isset($this->tag_idx[$name][$b])) unset($ids[$b]);
714 if(!$ids)
return self::$_nl_;
719 $ctx = $this->_get_ctx($ctx);
720 if(!$ctx)
throw new \Exception(__CLASS__.
'->'.__FUNCTION__.
': Invalid context!');
721 $ids = $ctx->_filter_contains($ids);
722 if(!$ids)
return $ids;
734 if(isset($actx) && !$actx)
return $aids;
735 if(is_string($attr)) $attr = self::html_parseAttrStr($attr);
737 foreach($actx as $aid => $a) {
738 if(!isset($this->attribs[$aid]))
continue;
739 $a = $this->attribs[$aid];
741 foreach($attr as $n => $v)
if(!isset($a[$n]) || $a[$n] !== $v) { $good =
false;
break; }
742 if($good) $aids[$aid] = $this->attr_idx[$aid];
745 foreach($this->attribs as $aid => $a) {
747 foreach($attr as $n => $v)
if(!isset($a[$n]) || $a[$n] !== $v) { $good =
false;
break; }
748 if($good) $aids[$aid] = $this->attr_idx[$aid];
750 return $as_keys ? $aids : array_keys($aids);
758 if(isset($actx) && !$actx)
return $aids;
759 if(!is_array($cl)) $cl = preg_split(
'|\\s+|',trim($cl));
760 if(!$cl) $cl = array_keys($this->class_idx);
761 foreach($cl as $cl)
if(isset($this->class_idx[$cl])) {
762 $aid = $this->class_idx[$cl];
764 if(is_array($aid))
foreach($aid as $aid => $cl) $aids[$aid] = $cl;
765 else $aids[$aid] = $this->attr_idx[$aid];
// An attribute-id context ($actx) was supplied: keep only the ids it contains.
// Two fixes compared to the previous form:
//  - the context is probed with the scalar id ($aid), not with the result
//    array ($aids), which is an illegal array offset;
//  - braces make the `elseif` pair with is_array() (as in the no-context
//    branch) instead of dangling onto the isset() inside the foreach, which
//    left the single-id case unhandled.
// NOTE(review): assumes $actx is keyed by attribute-set id, as it is iterated
// in get_aids_byAttr() - confirm.
if(is_array($aid)) {
    foreach($aid as $aid => $cl)
        if(isset($actx[$aid])) $aids[$aid] = $cl;
}
elseif(isset($actx[$aid])) $aids[$aid] = $this->attr_idx[$aid];
772 else return self::$_ar_;
773 return $as_keys ? $aids : array_keys($aids);
776 protected function get_aids_byClassAttr($cl, $attr, $as_keys=
false, $actx=NULL) {
778 if(is_string($attr)) $attr = self::html_parseAttrStr($attr);
779 if($attr)
foreach($aids as $aid => $ix) {
780 $a = $this->attribs[$aid];
781 $good = count($a) > 1;
782 if($good)
foreach($attr as $n => $v) {
783 if(!isset($a[$n]) || $a[$n] !== $v) {
788 if(!$good) unset($aids[$aid]);
790 return $as_keys ? $aids : array_keys($aids);
799 if(!$has_keys) $aid = self::array_select($this->attr_idx, $aid);
800 foreach($aid as $aid => $aix) {
801 if(!is_array($aix)) $aix =array($aix=>$this->ids[$aix]);
802 if(empty($ret)) $ret = $aix;
803 else foreach($aix as $id => $e) $ret[$id] = $e;
805 if($sort && $ret) ksort($ret);
809 protected function get_ids_byAttr($attr, $sort=
true) {
811 if(is_string($attr)) $attr = self::html_parseAttrStr($attr);
812 if(!$attr)
return $ret;
814 foreach(self::$_index_attribs as $atn) {
815 if(isset($attr[$atn])) {
816 if(empty($this->idx_attr[$atn]))
return $ret;
817 $sat[$atn] = $attr[$atn];
823 if(!$aids)
return $ret;
824 foreach($aids as $aid => $aix) {
825 if(!is_array($aix)) $aix = array($aix=>$this->ids[$aix]);
826 foreach($aix as $id => $e) {
829 foreach($sat as $n => $v) {
830 if(!isset($this->idx_attr[$n][$id]) || $this->idx_attr[$n][$id] !== $v) {
835 if($good) $ret[$id] = $e;
841 $av = reset($sat); $an = key($sat); unset($sat[$an]);
842 $aix = $this->idx_attr[$an];
843 foreach($aix as $id => $v) {
844 if($v !== $av)
continue;
845 $e = $this->ids[$id];
848 foreach($sat as $n => $v) {
849 if(!isset($this->idx_attr[$n][$id]) || $this->idx_attr[$n][$id] !== $v) {
854 if($good) $ret[$id] = $e;
861 if($sort) ksort($ret);
865 protected function get_ids_byClass($cl, $sort=
true) {
870 protected function get_ids_byClassAttr($cl, $attr, $sort=
true) {
871 $aids = $this->get_aids_byClassAttr($cl, $attr,
true);
875 protected function get_attr_byAid($aid, $to_str=
false) {
878 foreach($aid as $aid) $ret[$aid] = $this->get_attr_byAid($aid, $to_str);
880 if(!isset($this->attribs[$aid]))
return self::$_fl_;
881 $ret = $this->attribs[$aid];
882 if($to_str) $ret = self::html_attr2str($ret);
887 protected function get_attr_byId($id, $attr=NULL, $to_str=
false) {
890 foreach($id as $id => $e) $ret[$id] = $this->get_attr_byId($id, $attr, $to_str);
893 if(!isset($this->ids[$id]))
return self::$_fl_;
894 $bu = isset($this->_prop[
'baseURL']);
896 if(isset($this->idx_attr[$attr])) $ret = @$this->idx_attr[$attr][$id];
897 else $ret = isset($this->attrs[$id], $this->attribs[$ret=$this->attrs[$id]]) ? @$this->attribs[$ret][$attr] : self::$_nl_;
898 if($ret && $bu && isset(self::$_url_attribs[$attr])) {
899 $ret = $this->url2abs($ret);
903 if(isset($this->attrs[$id])) $ret = $this->attribs[$this->attrs[$id]];
904 foreach(self::$_index_attribs as $atn) {
905 if(isset($this->idx_attr[$atn][$id])) $ret[$atn] = $this->idx_attr[$atn][$id];
909 foreach(self::$_url_attribs as $n) {
910 if(isset($ret[$n])) $ret[$n] = $this->url2abs($ret[$n]);
913 if($to_str) $ret = self::html_attr2str($ret);
get_ids_byAid($aid, $sort=true, $has_keys=false)
_info()
This method is for debugging only.
static abs_url($url, $base)
static is_abs_path($path)
baseURI($href=NULL)
Get or set the baseURI.
_index_comments_html($o)
Index the positions of comment tags in the source HTML.
static is_url_path($path)
find($sel, $_attr=NULL, $ctx=NULL)
get_aids_byClass($cl, $as_keys=false, $actx=NULL)
get_aids_byAttr($attr, $as_keys=false, $actx=NULL)