Here's the latest version, with a number of miscellaneous changes. This version is a
bit more efficient and removes more whitespace.
Sorry that the major update I promised for this is taking so long. I have been working on the feature set for many months now and expect it will take several more months before it is released. It won't even keep the name "HTML Catalog Cleaner", because it will do far more, but I plan to announce it here when it is ready.
Code:
<?php
# Modify PHP Settings #
#######################
// zlib output compression is switched off because, when on, it causes output problems in Mozilla.
ini_set('zlib.output_compression', 'Off');
// Raise both the execution and the input time limit to 4 hours (in seconds) so the script doesn't time-out.
foreach (array('max_execution_time', 'max_input_time') as $time_limit_setting)
    ini_set($time_limit_setting, 60*60*4);
unset($time_limit_setting);
// Flush after every output call so the progress shows up in the browser while the script runs.
ob_implicit_flush(1);
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head lang="en">
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<title>HTML Catalog Cleaner</title>
</head>
<body>
<?php
# $Id: clean.php,v 1.2.0 2005/10/08 01:32:05 bjoshua Exp $
##############################################################################
## ## HTML Catalog Cleaner ## ##
##############################################################################
## Strips every file in the HTML catalog directory of all excess white ##
## spaces. ##
##############################################################################
## Initially Created: 6/26/2004
# Define the constants.
# CATALOG_DIR: absolute path of the directory holding the HTML files to clean. Edit before running.
define('CATALOG_DIR', '/your/catalog/directory/here');
# BAR_LENGTH_REDUCER: number of files processed between two ticks of the progress bar.
# If you have over 1000 HTML files in your catalog, you may wish to set this number higher.
define('BAR_LENGTH_REDUCER', 3);

# Initialize variables.
$successes  = 0;                                // Files rewritten successfully.
$failures   = 0;                                // Files that could not be rewritten.
$filelength = array('init' => 0, 'final' => 0); // Combined length before / after cleaning.
$cnt        = array('tmp' => 0,                 // Ticks printed on the current progress bar row.
                    'tot' => 0);                // Total number of files handled.
$pblr       = 0;                                // Files seen since the last progress bar tick.

# Initialize regular expressions.
# Each key is a PCRE pattern and each value its replacement; they are applied in the order listed.
# NOTE(review): the first rule deletes line breaks and tabs outright instead of turning them into a
# space, so two words separated only by a newline end up joined - confirm this suits your catalog.
$regex = array(
    "/[\t\n\r\f]+/"  => '',        // Newlines and tabs.
    '/\s+/'          => ' ',       // All excess white space.
    // Excess space after or before a non-breaking space. The entity is spelled out here: the
    // previous pattern '/ * */' had lost it and matched (and deleted) spaces themselves.
    '/ *&nbsp; */i'  => '&nbsp;',
    '/> </'          => '><'       // Space between HTML tags.
    # Add anything else here as you need.
    # Ex: 'Search' => 'Replace'
);
# Matches one complete <script>...</script> element (case-insensitive, spanning lines) so that
# script code can be kept out of the whitespace stripping.
$java_saver = '|(<script[^>]*>.*?</script>)|si';
# Pad with 256 bytes for Internet Explorer to show output immediately.
# The User-Agent header is optional, so it is only inspected when the client actually sent one.
if (isset($_SERVER['HTTP_USER_AGENT']) && strpos($_SERVER['HTTP_USER_AGENT'], 'MSIE') !== false) {
    print(str_repeat("\t", 256)."\n");
}
# Announce the start of the run; the progress bar is printed right after this line.
print('Stripping the HTML files of excess spaces...
');
# Open the directory and store the file list.
if (!is_dir(CATALOG_DIR))
    exit('
'.CATALOG_DIR.' is not a directory! Please check the path and try again.');

$file_list    = array();
$failure_list = array();
if ($dh = opendir(CATALOG_DIR)) {
    # Iterate over the directory entries.
    while (($filename = readdir($dh)) !== false) {
        # Keep only regular files whose name ends in .htm or .html. A plain substring test
        # would also pick up directories and backups such as "page.html.bak".
        if (preg_match('/\.html?$/i', $filename) && is_file(CATALOG_DIR.DIRECTORY_SEPARATOR.$filename))
            $file_list[] = $filename;
    }
    closedir($dh); // Close the directory.
}
if (empty($file_list))
    exit('<h3>No recognizable HTML files could be found in the directory "'.CATALOG_DIR.'"</h3>');

# Perform specific operations on the files.
foreach ($file_list as $file) {
    $path    = CATALOG_DIR.DIRECTORY_SEPARATOR.$file;
    $written = false;

    $file_contents = file_get_contents($path);
    # An unreadable file is never opened for writing: mode 'w' would truncate it.
    if ($file_contents !== false) {
        $original_length     = strlen($file_contents);
        $filelength['init'] += $original_length;

        # Remember every <script> block exactly as it appears in the original document.
        $script_count = preg_match_all($java_saver, $file_contents, $got_java);

        # Do each replacement. $cleaned becomes null as soon as a PCRE call reports an error,
        # in which case the file is left untouched and logged as a failure.
        $cleaned = ($script_count === false) ? null : $file_contents;
        foreach ($regex as $search => $replace) {
            if ($cleaned === null)
                break;
            $cleaned = preg_replace($search, $replace, $cleaned);
        }

        # Reverse the damage to the JavaScripts: split the cleaned document around its script
        # blocks (text, script, text, ..., text) and put the n-th original script back in the
        # n-th script slot. Restoring by position cannot touch any other part of the document.
        if ($cleaned !== null && $script_count > 0) {
            $pieces = preg_split($java_saver, $cleaned, -1, PREG_SPLIT_DELIM_CAPTURE);
            if ($pieces === false || count($pieces) != 2 * $script_count + 1) {
                # The script blocks no longer line up with the originals; do not write a guess.
                $cleaned = null;
            } else {
                foreach ($got_java[1] as $index => $original_script)
                    $pieces[2 * $index + 1] = $original_script;
                $cleaned = implode('', $pieces);
            }
        }

        if ($cleaned !== null) {
            $fp = fopen($path, 'w'); // Truncate file, then apply the modifications.
            if ($fp !== false) {
                # Compare the byte count so that an empty result (0 bytes) is not taken for a failure.
                $written = (fwrite($fp, $cleaned) === strlen($cleaned));
                fclose($fp);
            }
        }

        # A file that was not rewritten is counted at its original length so that it does not
        # show up as removed white space in the totals.
        $filelength['final'] += $written ? strlen($cleaned) : $original_length;
    }

    if ($written)
        $successes++;
    else {
        $failure_list[] = $file; // Log failures.
        $failures++;
    }

    if ($pblr == BAR_LENGTH_REDUCER) { // Progress bar length reducer.
        print('|'); // Lengthen the progress bar.
        $cnt['tmp']++; // Increment the newline counter.
        $pblr = 0; // Reset pblr counter.
    } else
        $pblr++;
    $cnt['tot']++; // Increment totals counter.
    if ($cnt['tmp'] == 250) { // Start a new progress bar row after 250 ticks.
        print('
');
        $cnt['tmp'] = 0; // Reset the counter.
    }
}
# Report the totals gathered during the run: number of successes and failures, the combined
# length of the files before and after cleaning, and the difference between the two.
# Every paragraph is opened as well as closed so the page stays valid XHTML.
print('
<p>There were '.number_format($successes).' successful cleanings and '.number_format($failures).' failures out of a total of '.number_format($cnt['tot']).' files.</p>'.
'
<p>Your HTML Catalog files had a total combined length of '.number_format($filelength['init']).' characters.<br />'.
'
They now have a total length of '.number_format($filelength['final']).' characters.</p>'.
'
<p>That is a total of <u>'.number_format($filelength['init']-$filelength['final']).'</u> excess white spaces removed from your files.</p>
'
);
# List every file that was logged as a failure, one per line.
if (!empty($failure_list)) {
    print('
<p>The following files could not be written to:<br />
');
    $row = 0;
    foreach ($failure_list as $failure) {
        # File names may contain characters that are special in HTML; escape them using the
        # charset the page declares (iso-8859-1).
        $label = htmlspecialchars($failure, ENT_QUOTES, 'ISO-8859-1');
        # Show background color every other line for readability.
        if ($row % 2 != 0)
            $label = '<font style="background-color:#E0E0E0">'.$label.'</font>';
        # An explicit line break is needed: a bare newline does not separate the entries on the page.
        print($label.'<br />
');
        $row++;
    }
    print('</p>
');
}
?>
</body>
</html>