#!/usr/bin/perl
#ASCII Strip by Dale Swanson November 15th 2007
#Goes through files and replaces non-ASCII chars with ASCII equivalents

use strict;
use warnings;

use Cwd;

my $filetype = ".htm"; #text a file name must contain (checked case-insensitively) for the file to be processed, change it to whatever you want to search for
my $outext = ".new.htm"; #extension appended to each input file's name to build the output file's name
my @allfiles; #stores all the entries of the directory currently being scanned
my $file; #used to access each entry in the allfiles array
my @dirs; #array of all the directories found so far, each one gets scanned in turn
my $dirname; #used to store the directory name of a directory
my $dircount = 0; #count of all the directories, also the next free index in @dirs (starts at 0 so the first index is not an undefined value)
my @htmlfiles; #array of the full paths of all the files that match the file type
my $filename; #used to store the file name of a file matching the file type
my $htmlcount = 0; #count of the matching files, also the next free index in @htmlfiles (starts at 0 for the same reason)
my $dir; #the name of the directory being worked with
my $startdir; #name of the directory the program is run from, with a trailing slash
my $debug = 0; #debug mode, set to 1 to get lots of output

my $mode = 0; #mode 0 leaves values < 32 alone, mode 1 replaces everything < 32 except \t, \n, \r (codes 9, 10, 13)
my @filearray; #will store the contents of the file
my $fileline; #will store each line of the file
my @linearray; #array of each char in fileline string
my $defaultchar = 126; #the default replacement char code, used when an unknown char is encountered.  Note that this is the numeric code, not the char itself (so 88 not X).  ~ is 126, $ is 36, X is 88.  Any number > 255 means "drop the char instead of replacing it"
my @newchar; #array that stores what each char code should translate to
#note all the values are in decimal (base 10), no hex, oct, or binary.
#newchar stores a number at each index from 0-255, one for every possible 8 bit char code.  Whatever value is stored at an index is the code of the char that will replace a char with the code of that index.
#example: In ASCII 88 is X, 97 is a.  If the value of $newchar[88] = 97, then all X will be replaced with a.  In formula form $newchar[x] = y, all chars with a code of x will be replaced by chars with a code of y.

@newchar = (0..255); #start with every code translating to itself

for my $code (127..255)
{#every code >= 127 is outside printable ASCII, so it starts out as the default char
	$newchar[$code] = $defaultchar;
}

if ($mode)
{#if mode = 1, then we will replace chars < 32 too
	for my $code (0..31)
	{#go through every number <= 31, set it to the default char
		$newchar[$code] = $defaultchar;
	}
	#tab, line feed and carriage return are always kept as they are
	$newchar[9] = 9; # \t
	$newchar[10] = 10; # \n
	$newchar[13] = 13; # \r
}

#below are the chars you want changed to something besides the default
#$newchar[x] = y, where all chars with a code of x will be replaced by chars with ASCII code of y
#NOTE(review): the codes on the left look like Windows-1252 punctuation (curly quotes, dashes, non breaking space) - confirm against the files being converted
$newchar[130] = 44; # ,
$newchar[132] = 34; # "
$newchar[145] = 39; # '
$newchar[146] = 39; # '
$newchar[147] = 34; # "
$newchar[148] = 34; # "
$newchar[150] = 45; # -
$newchar[151] = 45; # -
$newchar[160] = 32; # Space
$newchar[173] = 45; # -
$newchar[175] = 45; # -
if ($debug)
{#in debug mode start with a cleared screen and a blank line
	system("CLS");
	print "\n";
}

$dir = cwd(); #the scan starts in the directory the program was launched from
$startdir = "$dir/"; #same directory with the trailing slash cwd() leaves off

sub listfiles
{# scans the directory named by the global $dir: every sub directory is added to @dirs, every file matching $filetype is added to @htmlfiles
	# takes no arguments and returns nothing, all results are stored in the file level variables
	# dies if the directory can't be opened
	(print "\ndir - $dir") if ($debug);
	opendir(my $dirhandle, $dir) or die "Can't open directory $dir: $!";
	@allfiles = grep { !/^\.\.?$/ } readdir($dirhandle); #gets all the entries in this directory, strips . and ..
	closedir($dirhandle);
	(print "\nALL FILES:\n@allfiles\n") if ($debug);
	foreach my $entry (@allfiles)
	{# go through each entry, check to see if it's a directory or the filetype we want
		my $path = $dir . "/" . $entry; #full path, the tests below would otherwise look in the wrong directory
		(print "\nTest File - $entry") if ($debug);
		if (-d $path)
		{# if it's a directory, queue it so it gets scanned later
			(print "\nList DIR - $entry") if ($debug);
			push(@dirs, $path);
			$dircount++;
			next; #a directory is never treated as a file, even if its name matches the file type
		}
		#files this program wrote on an earlier run end in $outext, skip them so they aren't converted again
		my $is_output = ($outext ne "" && $entry =~ /\Q$outext\E$/i);
		#\Q..\E makes the file type match literally, otherwise the . in ".htm" would match any char
		if (!$is_output && $entry =~ /\Q$filetype\E/i)
		{# if the file type was found
			(print "\nList File - $entry") if ($debug);
			push(@htmlfiles, $path); #adds the file's name to the array of files, uses the full path
			$htmlcount++;
		}
	}
	(print "\n***\nAll Directories:\n@dirs\n") if ($debug);
	return;
}
listfiles(); #run the sub for the first time, on the starting directory

#listfiles adds every sub directory it finds to the end of @dirs, so @dirs grows while we walk it.
#An index loop that rechecks the size on every pass handles that safely, a foreach over an array that is being added to is not guaranteed to.
for (my $next = 0; $next < @dirs; $next++)
{# go through all the directories, and run the main sub, to find more files
	$dir = $dirs[$next]; #listfiles reads the directory to scan from $dir
	(print "\ndirname - $dirs[$next], dir - $dir") if ($debug);
	listfiles();
}

foreach $filename (@htmlfiles)
{# go through all the matching files, write a converted copy of each one next to the original
	#$filename =~ s/$startdir//; #strip the file path of the part prior to the starting directory, will give paths like /test/fun/file.html rather than C:/site/test/fun/file.htm
	my $outname = $filename . $outext; #output name is the input name with the output extension added, so page.htm becomes page.htm.new.htm
	(print "\nfilecount - $htmlcount") if ($debug);
	(print "\nLoop File - $filename") if ($debug);
	(print "\nOut File - $outname") if ($debug);

	print "\nInput - $filename \t Output - $outname";

	#:raw makes both handles work on plain bytes, so every char code is 0-255 and no line ending translation happens
	my $in;
	if (!open($in, '<:raw', $filename))
	{#one unreadable file shouldn't stop the rest of the run
		warn "\nCan't read $filename: $!";
		next;
	}
	my $out;
	if (!open($out, '>:raw', $outname))
	{
		warn "\nCan't write $outname: $!";
		close($in);
		next;
	}

	while (defined(my $line = <$in>))
	{#go through the file one line at a time, so the whole file never has to sit in memory
		foreach my $oldchar (split(//, $line))
		{#go through each char, to see if it will be replaced or not
			my $code = $newchar[ord($oldchar)]; #the code this char translates to
			my $replacement = (defined($code) && $code <= 255) ? chr($code) : ""; #a code above 255 means the char is dropped
			(print "\nold $oldchar  new $replacement") if ($debug);
			print {$out} $replacement; #records it to the file
		}
	}

	close($in);
	close($out) or warn "\nCan't finish writing $outname: $!"; #buffered write errors only show up here
}