jsFlagState = $jsFlags; $this->htmlOptions = $htmlOptions; }

    /*****************************************************************
    * HTML parsers - main parsing function splits up document into
    * component parts ('normal' HTML, scripts and styles)
    ******************************************************************/

    /**
     * Parse a complete HTML document.
     *
     * Runs the one-off document-level rewrites, then walks the document
     * splitting it into plain HTML, <script> and <style> sections. Each
     * section is handed to $this->HTML(), $this->JS() or $this->CSS() and
     * the results are stitched back together. Finally the optional
     * insert / injected javascript / footer are added.
     *
     * Side effects: may raise the 'pcre.backtrack_limit' ini setting,
     * sets the global $charset when it is not already set, and resets the
     * global $foundPlugin after calling postParse().
     *
     * @param string       $input  Raw HTML document
     * @param string|false $insert Markup to place directly after <body>
     *                             (prepended to the document when no <body>
     *                             tag is found and the document is not a
     *                             frameset); pass false to skip this step
     * @param bool         $inject When truthy, the output of injectionJS()
     *                             is placed after <head> (or prepended)
     * @param string       $footer Markup to place before </body> (or
     *                             appended when no </body> is found)
     * @return string The parsed document
     */
    function HTMLDocument($input, $insert='', $inject=false, $footer='') {

        # Raise the PCRE backtrack limit for large documents on PHP <= 5.3.7
        if ( strlen($input) > 65536 && version_compare(PHP_VERSION, '5.3.7') <= 0 ) {
            ini_set('pcre.backtrack_limit', '1000000');
        }

        #
        # Apply parsing that only needs to be done once..
        #

        # Record the charset (only if nothing has set it yet). The quote after
        # "charset=" is optional so that both
        #   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        # and the unquoted <meta charset=utf-8> form are recognised.
        global $charset;

        if ( ! isset($charset) ) {

            # First choice: the <meta http-equiv ...> tag
            $meta_equiv = preg_match('#(<meta[^>]*http\-equiv\s*=[^>]*>)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[0][0] : null;

            if ( isset($meta_equiv) ) {
                $charset = preg_match('#charset\s*=\s*["\']*([^"\'\s>]*)#is', $meta_equiv, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
            }

        }

        if ( ! isset($charset) ) {

            # Second choice: any <meta ... charset=...> tag
            $meta_charset = preg_match('#<meta[^>]*charset\s*=\s*["\']*([^"\'\s>]*)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;

            if ( isset($meta_charset) ) {
                $charset = $meta_charset;
            }

        }

        # Remove empty script comments
        $input = preg_replace('#/\*\s*\*/#s', '', $input);

        # Remove conditional comments: keep the content of "[if !IE]" blocks,
        # then drop every remaining "[if ...]" block entirely
        $input = preg_replace('#<\!\-\-\[if \!IE\]>\s*\-\->(.*?)<\!\[endif\]\-\->#s','$1',$input);
        $input = preg_replace('#<\!\-\-\[if.*?<\!\[endif\]\-\->#s','',$input);

        # Prevent websites from calling disableOverride()
        $input = preg_replace('#disableOverride#s', 'disabled___disableOverride', $input);

        # Remove titles if option is enabled
        if ( $this->htmlOptions['stripTitle'] || $this->htmlOptions['encodePage'] ) {
            $input = preg_replace('#<title.*?</title>#is', '', $input, 1);
            $input = preg_replace('#<meta[^>]*name=["\'](title|description|keywords)["\'][^>]*>#is', '', $input, 3);
            $input = preg_replace('#<link[^>]*rel=["\'](icon|shortcut icon)["\'][^>]*>#is', '', $input, 2);
        }

        # Remove and record a <base> href
        $input = preg_replace_callback('#<base href\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

        # Proxy url= values in meta redirects
        $input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

        # Process forms
        $input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

        # Remove scripts blocks (avoids individual processing below)
        if ( $this->htmlOptions['stripJS'] ) {
            $input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
        }

        #
        # Split up the document into its different types and parse them
        #

        # Build up new document into this var
        $new    = '';
        $offset = 0;

        # Find instances of script or style blocks
        while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {

            # What type of block is this?
            $block = strtolower($match[1][0]);

            # Start position of content (just past the opening tag)
            $outerStart = $match[0][1];
            $innerStart = $outerStart + strlen($match[0][0]);

            # Determine type of end tag and find its position
            $innerEnd = stripos($input, "</$block>", $innerStart);

            if ( $innerEnd === false ) {

                # No matching end tag; fall back to the next closing tag of any kind
                $innerEnd = stripos($input, '</', $innerStart);

                if ( $innerEnd === false ) {

                    # Nothing closes the block at all: strip from <script> to the
                    # end of the document and stop splitting.
                    # NOTE(review): this pattern only targets <script>, even when
                    # the unterminated block is a <style> - confirm that is intended.
                    $input = preg_replace('#<script[^>]*>.*?$#is', '', $input);
                    break;

                }

            }

            # Parse everything up till here (including the opening tag) and add to the new document
            $new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

            # Find parsing function
            $parseFunction = $block === 'style' ? 'CSS' : 'JS';

            # Add the parsed block
            $new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart));

            # Move offset to new position; the end tag itself is picked up
            # by the next HTML chunk
            $offset = $innerEnd;

        }

        # And add the final chunk (between last script/style block and end of doc)
        $new .= $this->HTML(substr($input, $offset));

        # Replace input with the updated document
        $input = $new;

        # Give a plugin the chance to post-process the parsed document
        global $foundPlugin;

        if ( $foundPlugin && function_exists('postParse') ) {
            $input = postParse($input, 'html');
            $foundPlugin = false;
        }

        # Make URLs relative
        $input = preg_replace('#=\s*(["\'])?\s*https?://[^"\'>/]*/#i', '=$1/', $input);

        # Encode the page
        if ( $this->htmlOptions['encodePage'] ) {
            $input = encodePage($input);
        }

        #
        # Now add our own code bits
        #
        # The inserted strings are arbitrary markup/javascript, so "\" and "$"
        # are escaped before they are used as a preg_replace() replacement;
        # otherwise sequences such as "$1" inside them would be treated as
        # backreferences.
        #

        # Insert our mini form after the <body>
        if ( $insert !== false ) {

            # Check for a frameset (kept as a boolean so that a frameset at
            # offset 0 is not mistaken for "not found")
            $useFrames = stripos($input, '<frameset') !== false;

            if ( $useFrames ) {

                # Flag the frames
                $input = preg_replace_callback('#<frame[^>]+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_flagFrames', $input);

            }

            # Attempt to add after body
            $input = preg_replace('#(<body[^>]*>)#i', '$1' . addcslashes($insert, '\\$'), $input, 1, $tmp);

            # Check it inserted and prepend (if not a frameset)
            if ( ! $tmp && ! $useFrames ) {
                $input = $insert . $input;
            }

        }

        # Insert our javascript library
        if ( $inject ) {

            # Generate javascript to insert
            $script = injectionJS();

            # Add our proxy javascript after <head>
            $input = preg_replace('#(<head[^>]*>)#i', '$1' . addcslashes($script, '\\$'), $input, 1, $tmp);

            # If no <head>, just prepend
            if ( ! $tmp ) {
                $input = $script . $input;
            }

        }

        # Add anything to the footer?
        if ( $footer ) {

            $input = preg_replace('#(</body[^>]*>)#i', addcslashes($footer, '\\$') . '$1', $input, 1, $tmp);

            # If no </body>, just append the footer
            if ( ! $tmp ) {
                $input .= $footer;
            }

        }

        # Return new document
        return $input;

    }

 # Parse HTML sections function HTML($input) { # Removing objects? Follow spec and display inner content of object tags instead. if ( $this->htmlOptions['stripObjects'] ) { # Remove all object tags (including those deprecated but still common) $input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp); # Found any? Remove the corresponding end tags if ( $tmp ) { $input = preg_replace('#object|applet|param|embed)>#i', '', $input, $tmp); } } else { # Parse tags $input = preg_replace_callback('#]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input); # To do: proxy object related URLs } # Show content within