jsFlagState = $jsFlags; $this->htmlOptions = $htmlOptions; } /***************************************************************** * HTML parsers - main parsing function splits up document into * component parts ('normal' HTML, scripts and styles) ******************************************************************/ function HTMLDocument($input, $insert='', $inject=false, $footer='') { if (strlen($input)>65536) { if (version_compare(PHP_VERSION, '5.3.7')<=0) { ini_set('pcre.backtrack_limit', 1000000); } } # # Apply parsing that only needs to be done once.. # # Record the charset global $charset; if (!isset($charset)) { $meta_equiv = preg_match('#(]*http\-equiv\s*=[^>]*>)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[0][0] : null; if (isset($meta_equiv)) { $charset = preg_match('#charset\s*=\s*["\']+([^"\'\s>]*)#is', $meta_equiv, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null; } } if (!isset($charset)) { $meta_charset = preg_match('#]*charset\s*=\s*["\']+([^"\'\s>]*)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? 
$tmp[1][0] : null; if (isset($meta_charset)) { $charset = $meta_charset; } } # Remove empty script comments $input = preg_replace('#/\*\s*\*/#s', '', $input); # Remove conditional comments $input = preg_replace('#<\!\-\-\[if \!IE\]>\s*\-\->(.*?)<\!\[endif\]\-\->#s','$1',$input); $input = preg_replace('#<\!\-\-\[if.*?<\!\[endif\]\-\->#s','',$input); # Prevent websites from calling disableOverride() $input = preg_replace('#disableOverride#s', 'disabled___disableOverride', $input); # Remove titles if option is enabled if ( $this->htmlOptions['stripTitle'] || $this->htmlOptions['encodePage'] ) { $input = preg_replace('##is', '', $input, 1); $input = preg_replace('#]*name=["\'](title|description|keywords)["\'][^>]*>#is', '', $input, 3); $input = preg_replace('#]*rel=["\'](icon|shortcut icon)["\'][^>]*>#is', '', $input, 2); } # Remove and record a href $input = preg_replace_callback('#]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1); # Proxy url= values in meta redirects $input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1); # Process forms $input = preg_replace_callback('#]*)>(.*?)#is', 'html_form', $input); # Remove scripts blocks (avoids individual processing below) if ( $this->htmlOptions['stripJS'] ) { $input = preg_replace('#]*>.*?#is', '', $input); } # # Split up the document into its different types and parse them # # Build up new document into this var $new = ''; $offset = 0; # Find instances of script or style blocks while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) { # What type of block is this? 
$block = strtolower($match[1][0]); # Start position of content $outerStart = $match[0][1]; $innerStart = $outerStart + strlen($match[0][0]); # Determine type of end tag and find it's position $endTag = ""; $innerEnd = stripos($input, $endTag, $innerStart); if ($innerEnd===false) { $endTag = "]*>.*?$#is', '', $input); break; } } $outerEnd = $innerEnd + strlen($endTag); # Parse everything up till here and add to the new document $new .= $this->HTML(substr($input, $offset, $innerStart - $offset)); # Find parsing function $parseFunction = $block == 'style' ? 'CSS' : 'JS' ; # Add the parsed block $new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart)); # Move offset to new position $offset = $innerEnd; } # And add the final chunk (between last script/style block and end of doc) $new .= $this->HTML(substr($input, $offset)); # Replace input with the updated document $input = $new; global $foundPlugin; if ( $foundPlugin && function_exists('postParse') ) { $input = postParse($input, 'html'); $foundPlugin=false; } # Make URLs relative $input = preg_replace('#=\s*(["\'])?\s*https?://[^"\'>/]*/#i', '=$1/', $input); # Encode the page if ( $this->htmlOptions['encodePage'] ) { $input = encodePage($input); } # # Now add our own code bits # # Insert our mini form after the if ( $insert !== false ) { # Check for a frameset if ( ( $useFrames = stripos($input, ']+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_flagFrames', $input); } # Attempt to add after body $input = preg_replace('#(]*>)#i', '$1' . $insert, $input, 1, $tmp); # Check it inserted and append (if not a frameset) if ( ! $tmp && ! $useFrames ) { $input = $insert . $input; } } # Insert our javascript library if ( $inject ) { # Generate javascript to insert $inject = injectionJS(); # Add our proxy javascript after $input = preg_replace('#(]*>)#i', '$1' . $inject, $input, 1, $tmp); # If no , just prepend if ( ! $tmp ) { $input = $inject . 
$input; } } # Add anything to the footer? if ( $footer ) { $input = preg_replace('#(]*>)#i', $footer . '$1', $input, 1, $tmp); # If no , just append the footer if ( ! $tmp ){ $input .= $footer; } } # Return new document return $input; } # Parse HTML sections function HTML($input) { # Removing objects? Follow spec and display inner content of object tags instead. if ( $this->htmlOptions['stripObjects'] ) { # Remove all object tags (including those deprecated but still common) $input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp); # Found any? Remove the corresponding end tags if ( $tmp ) { $input = preg_replace('#object|applet|param|embed)>#i', '', $input, $tmp); } } else { # Parse tags $input = preg_replace_callback('#]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input); # To do: proxy object related URLs } # Show content within

' . $add . $input[2] . ''; }

# NOTE(review): the line above is the tail of a definition whose opening
# lines are not visible in this part of the file; it is kept unchanged.

# Proxy the action="URL" value in forms.
# $input is a regex match array: $input[1] is emitted on both sides of the
# value (presumably the captured quote character) and $input[2] is the URL
# passed to proxyURL().
function html_formAction($input) {
    $delimiter = $input[1];

    return 'action=' . $delimiter . proxyURL($input[2]) . $delimiter;
}

# Encode input names.
# Same match layout as html_formAction(): $input[1] wraps the value and
# $input[2] is the name passed to inputEncode().
function html_inputName($input) {
    $delimiter = $input[1];

    return 'name=' . $delimiter . inputEncode($input[2]) . $delimiter;
}

# Proxy URL values in attributes.
# $input[0] is the whole match and $input[3] the URL found inside it.
function html_attribute($input) {

    # Is this an iframe? ('iframe' found at offset 1 of the match, i.e. after
    # exactly one leading character.)
    $flag = stripos($input[0], 'iframe') === 1 ? 'frame' : '';

    # URL occurred as value of an attribute and should have been htmlspecialchar()ed.
    # We need to do the job of the browser and decode before proxying,
    # then re-encode the proxied URL before putting it back.
    $decoded = htmlspecialchars_decode($input[3]);
    $proxied = htmlspecialchars(proxyURL($decoded, $flag));

    return str_replace($input[3], $proxied, $input[0]);
}

# Flag frames in a frameset so only the first one shows the mini-form.
# This could be done in the above callback but adds extra processing
# when 99% of the time, it won't be needed.
# $input[0] is the whole match, $input[2] the frame URL. State is kept in a
# static variable, so "first frame" means the first call in this request.
function html_flagFrames($input) {

    static $addFlag;

    # If it's the first frame, leave it but set the flag var
    if ( ! isset($addFlag) ) {
        $addFlag = true;
        return $input[0];
    }

    # Add the frame flag. Compare against false explicitly: a '?' at position 0
    # is still a query string and must get the '&f=frame' form.
    $hasQuery = strpos($input[2], '?') !== false;
    $newURL   = $input[2] . ( $hasQuery ? '&f=frame' : 'fframe/' );

    return str_replace($input[2], $newURL, $input[0]);
}


/*****************************************************************
* CSS callbacks
******************************************************************/

# Proxy CSS url(LOCATION)
# $input[1] is the location; surrounding whitespace is trimmed first.
function css_URL($input) {
    $location = trim($input[1]);

    return 'url(' . proxyURL($location) . ')';
}

# Proxy CSS @import "URL"
function css_import($input) {
    return '@import "' . proxyURL($input[1]) . '"';
}

# Proxy CSS src=
# $input[1] wraps the value on both sides, $input[2] is the URL.
function css_src($input) {
    $delimiter = $input[1];

    return 'src=' . $delimiter . proxyURL($input[2]) . $delimiter;
}


# Callbacks for use with unique URLs and cached CSS.
# The original comment says the output "acts as a marker for quick and easy
# processing later".
# NOTE(review): the three literals below contain no marker and ignore the
# matched URL - they look like they lost their content; confirm against the
# upstream source before relying on them.

# Unique CSS url(LOCATION)
function css_URL_unique($input) {
    return 'url()';
}

# Unique CSS @import "URL"
function css_import_unique($input) {
    return '@import ""';
}

# Unique CSS src=
function css_src_unique($input) {
    $delimiter = $input[1];

    return 'src=' . $delimiter . '' . $delimiter;
}


/*****************************************************************
* Helper functions
******************************************************************/

# Take a string, and check that the next non-whitespace char (scanning
# forward from $offset) is the passed in char (X).
#   - Returns false if a non-whitespace, non-X char is found first, or if
#     the end of the string is reached.
#   - Otherwise returns the position of X.
#   - If $inverse is true, the next non-whitespace char must NOT be X: the
#     position of that other char is returned, or false if X is found.
#   - If $pastChar is true, whitespace after X is skipped as well and the
#     position of the first char after that whitespace is returned.
function str_checknext($input, $char, $offset, $inverse = false, $pastChar = false) {

    $length = strlen($input);

    for ( $i = $offset; $i < $length; ++$i ) {

        $current = $input[$i];

        # Ignore whitespace
        if ( $current == ' ' || $current == "\t" || $current == "\r" || $current == "\n" ) {
            continue;
        }

        # Found a non-$char, non-whitespace char:
        # desired result if $inverse, failure otherwise
        if ( $current != $char ) {
            return $inverse ? $i : false;
        }

        # Found the passed char, but $inverse means we do NOT want it
        if ( $inverse ) {
            return false;
        }

        # Move past this to the next non-whitespace?
        if ( $pastChar ) {
            ++$i;
            return $i + strspn($input, " \t\r\n", $i);
        }

        # Found desired char, no $pastChar, just return X offset
        return $i;
    }

    return false;
}


# Same as str_checknext() but scanning backwards from $offset.
# Returns the position of $char (or, with $inverse, of the first other
# non-whitespace char), false on a mismatch, and $inverse itself when only
# whitespace was seen.
# NOTE(review): the loop stops before index 0, so the first character of
# $input is never examined - confirm callers rely on this before changing it.
function str_checkprev($input, $char, $offset, $inverse = false) {

    for ( $i = $offset; $i > 0; --$i ) {

        $current = $input[$i];

        # Ignore whitespace
        if ( $current == ' ' || $current == "\t" || $current == "\r" || $current == "\n" ) {
            continue;
        }

        # Found char
        if ( $current == $char ) {
            return $inverse ? false : $i;
        }

        # Found non-$char char
        return $inverse ? $i : false;
    }

    return $inverse;
}


# Analyze javascript and return offset positions.
# Default is to find the end of the statement, indicated by:
#  (1) ; while not in string
#  (2) newline which, if not there, would create invalid syntax
#  (3) a closing bracket (object, language construct or function call) for which
#      no corresponding opening bracket was detected AFTER the passed offset
#  (4) a comma outside any bracket opened after the passed offset
# If (int) $argPos is true, we return an array of the start and end position
# for the nth argument, where n = $argPos. The $start position must be just inside
# the parenthesis of the function call we're interested in.
# If no end is found, the length of $input is used as the end.
function analyze_js($input, $start, $argPos = false) {

    # Argument counter, only meaningful when looking for an argument position
    $currentArg = 1;

    $length       = strlen($input);
    $end          = false;
    $openObjects  = 0;
    $openBrackets = 0;
    $openArrays   = 0;

    # Loop through the input, stopping only at special chars
    for ( $i = $start; $end === false && $i < $length; ++$i ) {

        $char = $input[$i];

        switch ( $char ) {

            # Starting string delimiters
            case '"':
            case "'":

                # An escaped quote does not open a string. Only look back when
                # there is a previous char: a negative offset would read from
                # the END of the string on PHP >= 7.1.
                if ( $i > 0 && $input[$i-1] == '\\' ) {
                    break;
                }

                # Skip straight to end of string
                # Find the corresponding end delimiter and ensure it's not escaped
                while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );

                # Check for false, in which case we assume the end is the end of the doc
                if ( $i === false ) {
                    break 2;
                }

                break;

            # End of operation?
            case ';':
                $end = $i;
                break;

            # New lines
            case "\n":
            case "\r":

                # Newlines are OK if occurring within open brackets, arrays or
                # objects, or whenever we are looking for an argument.
                if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
                    break;
                }

                # Newlines are also OK if followed by an opening function OR concatenation
                # e.g. someFunc\n(params) or someVar \n + anotherVar
                # $tmp is the position of the last whitespace char in this run,
                # so $tmp+1 is the next non-whitespace char.
                $tmp = $i + strspn($input, " \t\r\n", $i+1);

                # And compare to allowed chars
                if ( isset($input[$tmp+1]) && ( $input[$tmp+1] == '(' || $input[$tmp+1] == '+' ) ) {
                    $i = $tmp;
                    break;
                }

                # Newline not indicated as OK, set the end to here
                $end = $i;
                break;

            # Concatenation
            case '+':
                # Our interest in the + operator is its use in allowing an expression
                # to span multiple lines. If we come across a +, move past all whitespace,
                # including newlines (which would otherwise indicate end of expression).
                $i += strspn($input, " \t\r\n", $i+1);
                break;

            # Opening chars (objects, parenthesis and arrays)
            case '{':
                ++$openObjects;
                break;
            case '(':
                ++$openBrackets;
                break;
            case '[':
                ++$openArrays;
                break;

            # Closing chars - is there a corresponding open char?
            # Yes = reduce stored count. No = end of statement.
            case '}':
                if ( $openObjects ) {
                    --$openObjects;
                } else {
                    $end = $i;
                }
                break;
            case ')':
                if ( $openBrackets ) {
                    --$openBrackets;
                } else {
                    $end = $i;
                }
                break;
            case ']':
                if ( $openArrays ) {
                    --$openArrays;
                } else {
                    $end = $i;
                }
                break;

            # Commas - tell us which argument it is
            case ',':

                # Ignore commas inside other functions or whatnot
                if ( $openObjects || $openBrackets || $openArrays ) {
                    break;
                }

                # Not looking for an argument: a top-level comma ends the statement.
                # (Previously this relied on an undefined $currentArg comparing
                # equal to a false $argPos.)
                if ( ! $argPos ) {
                    $end = $i;
                    break;
                }

                # End now - this comma closes the argument we were asked for
                if ( $currentArg == $argPos ) {
                    $end = $i;
                }

                # Increase the current argument number
                ++$currentArg;

                # If we're not after the first arg, start now?
                if ( $currentArg == $argPos ) {
                    $start = $i+1;
                }

                break;
        }
    }

    # End not found? Use end of document
    if ( $end === false ) {
        $end = $length;
    }

    # Return array of start/end
    if ( $argPos ) {
        return array($start, $end);
    }

    # Return end
    return $end;
}


# Find the end of the javascript expression starting at $start.
# Works like analyze_js() without an argument position, except that newlines
# never end the expression. The end is the first of:
#  (1) ; while not in string
#  (2) a comma outside any bracket opened after $start
#  (3) a closing bracket with no matching opening bracket after $start
# If none is found, the length of $input is returned.
function analyzeAssign_js($input, $start) {

    $length       = strlen($input);
    $end          = false;
    $openObjects  = 0;
    $openBrackets = 0;
    $openArrays   = 0;

    # Loop through the input, stopping only at special chars
    for ( $i = $start; $end === false && $i < $length; ++$i ) {

        $char = $input[$i];

        switch ( $char ) {

            # Starting string delimiters
            case '"':
            case "'":

                # An escaped quote does not open a string. Only look back when
                # there is a previous char: a negative offset would read from
                # the END of the string on PHP >= 7.1.
                if ( $i > 0 && $input[$i-1] == '\\' ) {
                    break;
                }

                # Skip straight to end of string
                # Find the corresponding end delimiter and ensure it's not escaped
                while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );

                # Check for false, in which case we assume the end is the end of the doc
                if ( $i === false ) {
                    break 2;
                }

                break;

            # End of operation?
            case ';':
                $end = $i;
                break;

            # New lines never end the expression here, whether or not a
            # bracket, array or object is open.
            case "\n":
            case "\r":
                break;

            # Concatenation
            case '+':
                # If we come across a +, move past all whitespace that follows it,
                # including newlines.
                $i += strspn($input, " \t\r\n", $i+1);
                break;

            # Opening chars (objects, parenthesis and arrays)
            case '{':
                ++$openObjects;
                break;
            case '(':
                ++$openBrackets;
                break;
            case '[':
                ++$openArrays;
                break;

            # Closing chars - is there a corresponding open char?
            # Yes = reduce stored count. No = end of statement.
            case '}':
                if ( $openObjects ) {
                    --$openObjects;
                } else {
                    $end = $i;
                }
                break;
            case ')':
                if ( $openBrackets ) {
                    --$openBrackets;
                } else {
                    $end = $i;
                }
                break;
            case ']':
                if ( $openArrays ) {
                    --$openArrays;
                } else {
                    $end = $i;
                }
                break;

            # Commas
            case ',':

                # Ignore commas inside other functions or whatnot
                if ( $openObjects || $openBrackets || $openArrays ) {
                    break;
                }

                # End now
                $end = $i;
                break;
        }
    }

    # End not found? Use end of document
    if ( $end === false ) {
        $end = $length;
    }

    # Return end
    return $end;
}


/*****************************************************************
* Page encoding functions
******************************************************************/

# Encode page - splits into HTML/script sections and encodes HTML.
# Every pattern match is copied through unencoded; the text between matches
# is passed to encodeBlock(). If the pattern matches nothing, the whole input
# is passed to encodeBlock().
# NOTE(review): both regex patterns and the "\n\n\n\n" separators below look
# like they lost markup (the first pattern is empty) - confirm against the
# upstream source. They are reproduced exactly as found.
function encodePage($input) {

    # Look for script blocks
    # Earlier attempt, marked "not working" in the original:
    # if ( preg_match_all('#<(?:script|style).*?#is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
    if ( ! preg_match_all('##is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {

        # No scripts is easy - just encode the lot
        return encodeBlock($input);
    }

    # Create starting offset - only start encoding after the position found
    # here, as this seems to help browsers cope!
    $offset = preg_match('#]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
    $new    = $offset ? substr($input, 0, $offset) : '';

    # Go through all the matches
    foreach ( $scripts[0] as $id => $match ) {

        # Determine position of the preceding non-script block
        # NOTE(review): $end is one less than the match offset, so the char
        # immediately before each match is not encoded - confirm this is intended.
        $end    = $match[1] ? $match[1]-1 : 0;
        $start  = $offset;
        $length = $end - $start;

        # Add encoded block to page if there is one
        if ($length && $length>0) {
            $new .= "\n\n\n\n";
            $new .= encodeBlock(substr($input, $start, $length));
            $new .= "\n\n\n\n";
        }

        # Add unencoded script to page
        $new .= "\n\n\n\n";
        $new .= $match[0];
        $new .= "\n\n\n\n";

        # Move offset up
        $offset = $match[1] + strlen($match[0]);
    }

    # Add final block
    if ( $remainder = substr($input, $offset) ) {
        $new .= encodeBlock($remainder);
    }

    # Return the encoded page
    return $new;
}


# Encode block - applies the actual encoding.
# note - intended to obfuscate URLs and HTML source code. Does not provide security. Use SSL for actual security.
# Reads the global $charset; when it is set it is lower-cased in place and, if
# mbstring is available, $input is converted to HTML entities from that charset.
# NOTE(review): the function currently returns an empty string and never uses
# $new or the converted $input - the decoder output appears to be missing;
# confirm against the upstream source.
function encodeBlock($input) {
    global $charset;
    $new='';

    if (isset($charset)) {
        $charset=strtolower($charset);
        if (function_exists('mb_convert_encoding')) {
            $input=mb_convert_encoding($input, 'HTML-ENTITIES', $charset);
        }
    }

    # Return javascript decoder
    return '';
}