#!/usr/bin/perl
# ASCII Strip by Dale Swanson, November 15th 2007
#
# Walks the current working directory and every directory below it. For each
# file whose name contains $filetype, writes a copy named "<original>$outext"
# in which every character is translated through the @newchar table, so that
# non-ASCII characters are replaced with ASCII stand-ins.
#
# NOTE(review): the match is a substring match, so a second run will also pick
# up the "*$outext" files produced by the first run - confirm whether that is
# wanted before relying on repeated runs.

use strict;
use warnings;
use Cwd;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
my $filetype    = ".htm";      # text a file name must contain (case-insensitive) to be processed
my $outext      = ".new.htm";  # appended to the input name to form the output name
my $debug       = 0;           # set to 1 to get lots of diagnostic output
my $mode        = 0;           # 0 leaves codes < 32 alone; 1 replaces them too, except \t \n \r (9, 10, 13)

# Default replacement for any character without an explicit mapping. This is
# the numeric code, not the character itself ($ is 36, ~ is 126, X is 88).
# Any value > 255 makes unknown characters disappear from the output.
my $defaultchar = 126;

# ---------------------------------------------------------------------------
# Translation table
# ---------------------------------------------------------------------------
# All values are decimal. $newchar[x] = y means: every character with code x
# is written out as the character with code y. Example: 88 is X and 97 is a,
# so $newchar[88] = 97 would turn every X into a. Indices 0-256 are populated;
# an entry > 255 means "drop this character".
my @newchar = (0 .. 256);      # identity mapping to start with

for my $code (127 .. 256) {    # everything outside 7-bit ASCII gets the default
    $newchar[$code] = $defaultchar;
}

if ($mode) {                   # mode 1: control characters get the default too
    for my $code (0 .. 31) {
        $newchar[$code] = $defaultchar;
    }
    $newchar[9]  = 9;          # \t
    $newchar[10] = 10;         # \n
    $newchar[13] = 13;         # \r
}

# Characters that should become something other than the default.
# NOTE(review): these codes look like Windows-1252 / Latin-1 punctuation
# (curly quotes, dashes, non-breaking space) - confirm against the input files.
$newchar[130] = 44;            # ,
$newchar[132] = 34;            # "
$newchar[145] = 39;            # '
$newchar[146] = 39;            # '
$newchar[147] = 34;            # "
$newchar[148] = 34;            # "
$newchar[150] = 45;            # -
$newchar[151] = 45;            # -
$newchar[160] = 32;            # Space
$newchar[173] = 45;            # -
$newchar[175] = 45;            # -

# ---------------------------------------------------------------------------
# State shared between listfiles and the main program
# ---------------------------------------------------------------------------
my @dirs;                      # full paths of every directory discovered so far
my @htmlfiles;                 # full paths of every file matching $filetype
my $dircount  = 0;             # number of directories found (debug/info only)
my $htmlcount = 0;             # number of matching files found (debug/info only)

if ($debug) {
    system("CLS") if $^O eq 'MSWin32';   # CLS only exists on Windows
    print "\n";
}

my $dir      = cwd();          # directory currently being scanned; starts at the cwd
my $startdir = $dir . "/";     # cwd() has no trailing slash, so add one

# listfiles([$target])
#
# Scans one directory (not recursively). Sub-directories are appended to
# @dirs and files whose name contains $filetype are appended to @htmlfiles,
# both as full paths. $target defaults to the global $dir so that the old
# argument-less call style keeps working.
# Dies if the directory cannot be opened.
# NOTE(review): -d follows symbolic links, so a link pointing back up the tree
# would be rescanned indefinitely - confirm whether links need to be skipped.
sub listfiles {
    my ($target) = @_;
    $target = $dir unless defined $target;

    print "\ndir - $target" if $debug;

    opendir(my $dh, $target) or die "Can't open directory: $!";
    my @entries = grep { !/^\.\.?$/ } readdir $dh;   # drop . and ..
    closedir $dh;

    print "\nALL FILES:\n@entries\n" if $debug;

    for my $entry (@entries) {
        my $path = $target . "/" . $entry;
        print "\nTest File - $entry" if $debug;

        if (-d $path) {
            print "\nList DIR - $entry" if $debug;
            push @dirs, $path;
            $dircount++;
        }
        elsif ($entry =~ /\Q$filetype\E/i) {
            # \Q..\E keeps the "." in $filetype literal instead of "any char".
            # Directories never reach this branch, so they are not opened as files.
            print "\nList File - $entry" if $debug;
            push @htmlfiles, $path;
            $htmlcount++;
        }
    }

    print "\n***\nAll Directories:\n@dirs\n" if $debug;
    return;
}

# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------

# Scan the starting directory, then every directory found along the way.
# @dirs grows while it is being walked, so it is traversed by index; a foreach
# over an array that is being extended is not safe.
listfiles($dir);
for (my $i = 0; $i < @dirs; $i++) {
    $dir = $dirs[$i];
    print "\ndirname - $dirs[$i], dir - $dir" if $debug;
    listfiles($dir);
}

for my $filename (@htmlfiles) {
    # To report paths relative to the starting directory, strip the prefix:
    # $filename =~ s/^\Q$startdir\E//;
    print "\nfilecount - $htmlcount" if $debug;
    print "\nLoop File - $filename"  if $debug;
    print "\nOut File - $filename"   if $debug;

    # The output name keeps the full input name, e.g. page.htm -> page.htm.new.htm
    my $outname = $filename . $outext;
    print "\nInput - $filename \t Output - $outname";

    # Open the input first so an unreadable file does not leave an empty output behind.
    my $in;
    unless (open($in, '<', $filename)) {
        warn "\nCan't read $filename: $!\n";
        next;
    }
    my $out;
    unless (open($out, '>', $outname)) {
        warn "\nCan't write $outname: $!\n";
        close $in;
        next;
    }

    while (defined(my $fileline = <$in>)) {
        for my $oldchar (split //, $fileline) {
            # Look up the replacement code; anything outside the table falls
            # back to the default, and a code > 255 means "drop the character".
            my $code = $newchar[ord $oldchar];
            $code = $defaultchar unless defined $code;
            my $replacement = $code <= 255 ? chr($code) : '';

            print "\nold $oldchar new $replacement" if $debug;
            print {$out} $replacement;
        }
    }

    close $in;
    close $out or warn "\nError while closing $outname: $!\n";
}