Text/joinAt at master · sderose/Text · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env perl -w
#
# joinAt: Join matching lines, with the next (or previous) line.
# 2014-08-31f: Written by Steven J. DeRose.
#
use strict;
use Getopt::Long;

our %metadata = (
    'title'        => "joinAt",
    'description'  => "Join matching lines, with the next (or previous) line.",
    'rightsHolder' => "Steven J. DeRose",
    'creator'      => "http://viaf.org/viaf/50334488",
    'type'         => "http://purl.org/dc/dcmitype/Software",
    'language'     => "Perl 5",
    'created'      => "2014-08-31",
    'modified'     => "2020-02-18",
    'publisher'    => "http://github.com/sderose",
    'license'      => "https://creativecommons.org/licenses/by-sa/3.0/"
);
our $VERSION_DATE = $metadata{'modified'};


=pod

=head1 Usage

joinAt [options] [--expr I<regex>] [files]

Remove the line-break at the end of any line matching the given I<regex>
(or I<--previous> for the beginning instead).
The match does not have to occur at the start or end of the line unless
you explicitly include "^" or "$".

By default, the matching text is also kept. To delete it, set I<--no-keep>.

The default for I<--expr> is /=$/, which joins up lines
that were split into shorter pieces
with '=' inserted at the end of each piece. This is a convention in MIME
files, so that is
part of the task of decoding "quoted-printable" mail format
(see L<IETF RFC 2045>).
See I<--decode> to also unescape =xx special characters,
and I<--mime> to skip past a standard MIME header, copying it unchanged,
before starting normal processing.


=head1 Options

(prefix 'no-' to negate where applicable)

=over

=item * B<--decode>

In addition, turn sequences of '=' plus 2 hex digits to the correponding
byte value (UTF-8 may still have to be interpreted after that).
This is mainly useful for MIME headers.

This is done I<before> testing whether a line matches I<--expr>.

=item * B<--expr> I<regex> OR B<-e> I<regex>

Specify a regular expression identifying lines to be joined with an adjacent line.
Matching lines are joined with the following line unless I<--previous>
is set, in which case they are joined with the previous line instead.
Default: C<=$>.

=item * B<--iencoding> I<e>

Assume the input is in encoding I<e>. Output is utf-8 unless you set I<--oencoding>.

=item * B<--keep>

Do not delete the matched text; just the newline. Default: true
(use I<--no-keep> to have the matched text deleted).

=item * B<--mime>

Assume there's a MIME header at the top, and do not start changing things until
after the first blank line (which indicates the end of the MIME header).
I<--decode> does not apply to the MIME header, and with I<--previous> The
line immediately following the MIME header cannot join onto the blank line
separating the header.

=item * B<--oencoding> I<e>

Write output in encoding I<e>. Default: utf-8.

=item * B<--previous>

Join matching lines with the I<preceding> line rather than the next line.

=item * B<--separator> I<s>

Insert I<s> in place of the removed line-breaks. Default: ''.
A typical value might be a single space.

=item * B<--strip>

Remove any leading whitespace from the later of lines being joined.
This would typically be accompanied by I<--separator ' '>, but need not be.

=item * B<--verbose> or B<-v>

Add more messages (repeatable).

=item * B<--version>

Show version info and exit.

=back


=head1 Known Bugs and Limitations

I<--decode> does not complain if it sees '=' followed by non-hex-digits.

After I<--decode>, any UTF-8 is not interpreted; this could be a feature or a bug.

Perhaps should watch for "Content-Transfer-Encoding: quoted-printable" in MIME
header, and use it to control where to apply I<--decode>.

I<--strip> just uses Perl regex '\\s' for whitespace, which may miss cases.

Might be nicer to support separate regexes for the prev and next lines,
including a way to require they match somehow.


=head1 Related commands

C<splitAtMatches> -- inserts newlines before, after, or in place
of regex matches.


=head1 History

=over

=item * 2014-08-31f: Written by Steven J. DeRose.

=item * 2014-10-03: ???

=item * 2020-02-18: New layout. Add I<--previous>, I<--separator>, I<--strip>.
Document a few edge cases.

=item * 2021-06-21: Add previous-line tracking features.

=back


=head1 Rights

Copyright 2014 by Steven J. DeRose. This work is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0>.

For the most recent version, see L<here|"http://www.derose.net/steve/utilities/"> or
L<https://github.com/sderose>.

=cut


###############################################################################
# Options
#
my $decode        = 0;
my $expr          = '=$';
my $keep          = 1;
my $iencoding     = "";
my $mime          = 0;
my $oencoding     = "utf8";
my $previous      = 0;
my $separator     = "";
my $strip         = 0;
my $verbose       = 0;

my %getoptHash = (
    "decode!"                 => \$decode,
    "e|expr=s"                => \$expr,
    "keep!"                   => \$keep,
    "iencoding=s"             => \$iencoding,
    "mime!"                   => \$mime,
    "oencoding=s"             => \$oencoding,
    "previous!"               => \$previous,
    "separator=s"             => \$separator,
    "strip!"                  => \$strip,
    "h|help"                  => sub { system "perldoc $0"; exit; },
    "v|verbose+"              => \$verbose,
    "version"                 => sub {
        die "Version of $VERSION_DATE, by Steven J. DeRose.\n";
    },
);
Getopt::Long::Configure ("ignore_case");
GetOptions(%getoptHash) || die "Bad options.\n";

(length($expr) >= 1) || die "--expr required.";

print "";
binmode(STDOUT, ":encoding($oencoding)");


###############################################################################
# Main
#
my $nRecs = my $nMatches = my $nNoMatches = 0;

# Copy MIME header unchanged if needed
my $inMIMEHeader = ($mime) ? 1:0;
if ($mime) {
    while (my $rec = <>) {
        $nRecs++;
        if ($rec =~ m/^\s*$/) { last; }
        print "$rec\n";
    }
}

my $prevLine = "";  # Keep available for cross-line comparison, etc.
my $prevMatchText = "";
my $pendingStrip = 0;  # line matched, and we need to --strip next one pre-join.
while (my $rec = <>) {
    $nRecs++;
    my $origRec = $rec;
    $rec =~ s/[\r\n]+$//;
    if ($decode) {
        $rec =~ s/=([0-9a-f][0-9a-f])/{ chr(hex($1)); }/gie;
    }
    if ($rec =~ m/$expr/) {
        my $theMatchText = ${^MATCH};
        $nMatches++;
        if (!$keep) { $rec =~ s/$expr//; }
        if ($previous) {
            if ($strip) { $rec =~ s/^\s+//; }
            print $separator . $rec;
        }
        else {
            if ($pendingStrip) {
                $rec =~ s/^\s+//;
                $pendingStrip = 0;
            }
            print "\n" . $rec . $separator;
            if ($strip) { $pendingStrip = 1; }
        }
    }
    else { # Not a matching line
        $nNoMatches++;
        if ($previous) { print "\n"; }
        if ($pendingStrip) { $rec =~ s/^\s+//; }
        print "$rec";
        $pendingStrip = 0;
    }
    $prevLine = $origRec;
    $prevMatchText = ${^MATCH};
}
if ($previous) { print "\n"; }

($verbose) && warn(sprintf(
    "joinAt: %d records, %d matches, %d non-matches\n",
    $nRecs, $nMatches, $nNoMatches));