0001 #!/usr/bin/env perl
0002 # SPDX-License-Identifier: GPL-2.0
0003 #
0004 # Clean a text file -- or directory of text files -- of stealth whitespace.
0005 # WARNING: this can be a highly destructive operation. Use with caution.
0006 #
0007
0008 use warnings;
0009 use bytes;
0010 use File::Basename;
0011
0012 # Default options
0013 $max_width = 79;
0014
0015 # Clean up space-tab sequences, either by removing spaces or
0016 # replacing them with tabs.
0017 sub clean_space_tabs($)
0018 {
0019 no bytes; # Tab alignment depends on characters
0020
0021 my($li) = @_;
0022 my($lo) = '';
0023 my $pos = 0;
0024 my $nsp = 0;
0025 my($i, $c);
0026
0027 for ($i = 0; $i < length($li); $i++) {
0028 $c = substr($li, $i, 1);
0029 if ($c eq "\t") {
0030 my $npos = ($pos+$nsp+8) & ~7;
0031 my $ntab = ($npos >> 3) - ($pos >> 3);
0032 $lo .= "\t" x $ntab;
0033 $pos = $npos;
0034 $nsp = 0;
0035 } elsif ($c eq "\n" || $c eq "\r") {
0036 $lo .= " " x $nsp;
0037 $pos += $nsp;
0038 $nsp = 0;
0039 $lo .= $c;
0040 $pos = 0;
0041 } elsif ($c eq " ") {
0042 $nsp++;
0043 } else {
0044 $lo .= " " x $nsp;
0045 $pos += $nsp;
0046 $nsp = 0;
0047 $lo .= $c;
0048 $pos++;
0049 }
0050 }
0051 $lo .= " " x $nsp;
0052 return $lo;
0053 }
0054
0055 # Compute the visual width of a string
0056 sub strwidth($) {
0057 no bytes; # Tab alignment depends on characters
0058
0059 my($li) = @_;
0060 my($c, $i);
0061 my $pos = 0;
0062 my $mlen = 0;
0063
0064 for ($i = 0; $i < length($li); $i++) {
0065 $c = substr($li,$i,1);
0066 if ($c eq "\t") {
0067 $pos = ($pos+8) & ~7;
0068 } elsif ($c eq "\n") {
0069 $mlen = $pos if ($pos > $mlen);
0070 $pos = 0;
0071 } else {
0072 $pos++;
0073 }
0074 }
0075
0076 $mlen = $pos if ($pos > $mlen);
0077 return $mlen;
0078 }
0079
0080 $name = basename($0);
0081
0082 @files = ();
0083
0084 while (defined($a = shift(@ARGV))) {
0085 if ($a =~ /^-/) {
0086 if ($a eq '-width' || $a eq '-w') {
0087 $max_width = shift(@ARGV)+0;
0088 } else {
0089 print STDERR "Usage: $name [-width #] files...\n";
0090 exit 1;
0091 }
0092 } else {
0093 push(@files, $a);
0094 }
0095 }
0096
0097 foreach $f ( @files ) {
0098 print STDERR "$name: $f\n";
0099
0100 if (! -f $f) {
0101 print STDERR "$f: not a file\n";
0102 next;
0103 }
0104
0105 if (!open(FILE, '+<', $f)) {
0106 print STDERR "$name: Cannot open file: $f: $!\n";
0107 next;
0108 }
0109
0110 binmode FILE;
0111
0112 # First, verify that it is not a binary file; consider any file
0113 # with a zero byte to be a binary file. Is there any better, or
0114 # additional, heuristic that should be applied?
0115 $is_binary = 0;
0116
0117 while (read(FILE, $data, 65536) > 0) {
0118 if ($data =~ /\0/) {
0119 $is_binary = 1;
0120 last;
0121 }
0122 }
0123
0124 if ($is_binary) {
0125 print STDERR "$name: $f: binary file\n";
0126 next;
0127 }
0128
0129 seek(FILE, 0, 0);
0130
0131 $in_bytes = 0;
0132 $out_bytes = 0;
0133 $blank_bytes = 0;
0134
0135 @blanks = ();
0136 @lines = ();
0137 $lineno = 0;
0138
0139 while ( defined($line = <FILE>) ) {
0140 $lineno++;
0141 $in_bytes += length($line);
0142 $line =~ s/[ \t\r]*$//; # Remove trailing spaces
0143 $line = clean_space_tabs($line);
0144
0145 if ( $line eq "\n" ) {
0146 push(@blanks, $line);
0147 $blank_bytes += length($line);
0148 } else {
0149 push(@lines, @blanks);
0150 $out_bytes += $blank_bytes;
0151 push(@lines, $line);
0152 $out_bytes += length($line);
0153 @blanks = ();
0154 $blank_bytes = 0;
0155 }
0156
0157 $l_width = strwidth($line);
0158 if ($max_width && $l_width > $max_width) {
0159 print STDERR
0160 "$f:$lineno: line exceeds $max_width characters ($l_width)\n";
0161 }
0162 }
0163
0164 # Any blanks at the end of the file are discarded
0165
0166 if ($in_bytes != $out_bytes) {
0167 # Only write to the file if changed
0168 seek(FILE, 0, 0);
0169 print FILE @lines;
0170
0171 if ( !defined($where = tell(FILE)) ||
0172 !truncate(FILE, $where) ) {
0173 die "$name: Failed to truncate modified file: $f: $!\n";
0174 }
0175 }
0176
0177 close(FILE);
0178 }