jsFlagState = $jsFlags;
$this->htmlOptions = $htmlOptions;
}
/*****************************************************************
* HTML parsers - main parsing function splits up document into
* component parts ('normal' HTML, scripts and styles)
******************************************************************/
function HTMLDocument($input, $insert='', $inject=false, $footer='') {
if (strlen($input)>65536) {
if (version_compare(PHP_VERSION, '5.3.7')<=0) {
ini_set('pcre.backtrack_limit', 1000000);
}
}
#
# Apply parsing that only needs to be done once..
#
# Record the charset
global $charset;
if (!isset($charset)) {
$meta_equiv = preg_match('#(]*http\-equiv\s*=[^>]*>)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[0][0] : null;
if (isset($meta_equiv)) {
$charset = preg_match('#charset\s*=\s*["\']+([^"\'\s>]*)#is', $meta_equiv, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
}
}
if (!isset($charset)) {
$meta_charset = preg_match('#]*charset\s*=\s*["\']+([^"\'\s>]*)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
if (isset($meta_charset)) {
$charset = $meta_charset;
}
}
# Remove empty script comments
$input = preg_replace('#/\*\s*\*/#s', '', $input);
# Remove conditional comments
$input = preg_replace('#<\!\-\-\[if \!IE\]>\s*\-\->(.*?)<\!\[endif\]\-\->#s','$1',$input);
$input = preg_replace('#<\!\-\-\[if.*?<\!\[endif\]\-\->#s','',$input);
# Prevent websites from calling disableOverride()
$input = preg_replace('#disableOverride#s', 'disabled___disableOverride', $input);
# Remove titles if option is enabled
if ( $this->htmlOptions['stripTitle'] || $this->htmlOptions['encodePage'] ) {
$input = preg_replace('#
#is', '', $input, 1);
$input = preg_replace('#]*name=["\'](title|description|keywords)["\'][^>]*>#is', '', $input, 3);
$input = preg_replace('#]*rel=["\'](icon|shortcut icon)["\'][^>]*>#is', '', $input, 2);
}
# Remove and record a <base> href
$input = preg_replace_callback('#]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);
# Proxy url= values in meta redirects
$input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);
# Process forms
$input = preg_replace_callback('##is', 'html_form', $input);
# Remove script blocks (avoids individual processing below)
if ( $this->htmlOptions['stripJS'] ) {
$input = preg_replace('##is', '', $input);
}
#
# Split up the document into its different types and parse them
#
# Build up new document into this var
$new = '';
$offset = 0;
# Find instances of script or style blocks
while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {
# What type of block is this?
$block = strtolower($match[1][0]);
# Start position of content
$outerStart = $match[0][1];
$innerStart = $outerStart + strlen($match[0][0]);
# Determine type of end tag and find its position
$endTag = "$block>";
$innerEnd = stripos($input, $endTag, $innerStart);
if ($innerEnd===false) {
$endTag = "";
$innerEnd = stripos($input, $endTag, $innerStart);
if ($innerEnd===false) {
$input = preg_replace('##is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
# Create starting offset - only start encoding after the tag matched
# by the pattern below, as this seems to help browsers cope!
$offset = preg_match('#]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
$new = $offset ? substr($input, 0, $offset) : '';
# Go through all the matches
foreach ( $scripts[0] as $id => $match ) {
# Determine position of the preceding non-script block
$end = $match[1] ? $match[1]-1 : 0;
$start = $offset;
$length = $end - $start;
# Add encoded block to page if there is one
if ($length && $length>0) {
$new .= "\n\n\n\n";
$new .= encodeBlock(substr($input, $start, $length));
$new .= "\n\n\n\n";
}
# Add unencoded script to page
$new .= "\n\n\n\n";
$new .= $match[0];
$new .= "\n\n\n\n";
# Move offset up
$offset = $match[1] + strlen($match[0]);
}
# Add final block
if ( $remainder = substr($input, $offset) ) {
$new .= encodeBlock($remainder);
}
# Update input with new
$input = $new;
} else {
# No scripts is easy - just encode the lot
$input = encodeBlock($input);
}
# Return the encoded page
return $input;
}
# Encode block - applies the actual encoding.
# Note: intended to obfuscate URLs and HTML source code. Does not provide
# security. Use SSL for actual security.
#
# Parameters:
#   $input - block of page source to encode
# Globals:
#   $charset - when set, it is lowercased IN PLACE (the change is visible to
#              every other user of the global) and then used as the source
#              encoding for the HTML-ENTITIES conversion
# Returns:
#   the string literal at the end of the function
#
# NOTE(review): the returned literal is empty and the converted $input is
# never used, even though the comment says a javascript decoder is returned.
# It looks like the decoder markup is missing from this copy of the file -
# confirm against the upstream source before relying on this function.
function encodeBlock($input) {

    global $charset;

    if (isset($charset)) {

        # Normalise the recorded charset; done even when mbstring is
        # unavailable, so the global is always left lowercased
        $charset = strtolower($charset);

        # Convert to HTML entities only if the mbstring extension is loaded.
        # NOTE(review): the 'HTML-ENTITIES' encoding name is deprecated in
        # mbstring as of PHP 8.2 - check before upgrading.
        if (function_exists('mb_convert_encoding')) {
            $input = mb_convert_encoding($input, 'HTML-ENTITIES', $charset);
        }

    }

    # Return javascript decoder
    return '';

}