build: i18n-scan.pl: use xgettext to extract message strings
authorJo-Philipp Wich <jo@mein.io>
Tue, 21 Jan 2020 17:39:32 +0000 (18:39 +0100)
committerJo-Philipp Wich <jo@mein.io>
Wed, 29 Jan 2020 07:33:37 +0000 (08:33 +0100)
Using xgettext has a few benefits compared to the previous perl extraction
approach. The xgettext utility is able to properly distinguish commented
from uncommented code and it is able handle concatenated constant
expressions such as `_("Some " + "string")`.

A further benefit is the ability to extract translations with disambiguation
contexts and plural translation calls.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
(cherry picked from commit 901a0821f66b1623ea04d1930a2e6e336138263a)

build/i18n-scan.pl

index fc516c6a04bc81fb759d8f1e69bd0a0c5fc882e2..ddec094caab48c25c9d7a2affdc2157a44246aad 100755 (executable)
 #!/usr/bin/perl
 
-use utf8;
 use strict;
 use warnings;
-use Text::Balanced qw(extract_tagged gen_delimited_pat);
+use IPC::Open2;
 use POSIX;
 
-POSIX::setlocale(POSIX::LC_ALL, "C");
+$ENV{'LC_ALL'} = 'C';
+POSIX::setlocale(POSIX::LC_ALL, 'C');
 
 @ARGV >= 1 || die "Usage: $0 <source directory>\n";
 
 
-my %stringtable;
+my %keywords = (
+       '.js' => [ '_:1', '_:1,2c', 'N_:2,3', 'N_:2,3,4c' ],
+       '.lua' => [ '_:1', '_:1,2c', 'translate:1', 'translate:1,2c', 'translatef:1', 'N_:2,3', 'N_:2,3,4c', 'ntranslate:2,3', 'ntranslate:2,3,4c' ],
+       '.htm' => [ '_:1', '_:1,2c', 'translate:1', 'translate:1,2c', 'translatef:1', 'N_:2,3', 'N_:2,3,4c', 'ntranslate:2,3', 'ntranslate:2,3,4c' ],
+       '.json' => [ '_:1', '_:1,2c' ]
+);
 
-sub dec_lua_str
-{
-       my $s = shift;
-       my %rep = (
-               'a' => "\x07",
-               'b' => "\x08",
-               'f' => "\x0c",
-               'n' => "\n",
-               'r' => "\r",
-               't' => "\t",
-               'v' => "\x76"
-       );
-
-       $s =~ s!\\(?:([0-9]{1,2})|(.))!
-               $1 ? chr(int($1)) : ($rep{$2} || $2)
-       !segx;
-
-       $s =~ s/[\s\n]+/ /g;
-       $s =~ s/^ //;
-       $s =~ s/ $//;
+sub xgettext($@) {
+       my $path = shift;
+       my @keywords = @_;
+       my ($ext) = $path =~ m!(\.\w+)$!;
+       my @cmd = qw(xgettext --from-code=UTF-8 --no-wrap);
 
-       return $s;
-}
+       if ($ext eq '.htm' || $ext eq '.lua') {
+               push @cmd, '--language=Lua';
+       }
+       elsif ($ext eq '.js' || $ext eq '.json') {
+               push @cmd, '--language=JavaScript';
+       }
 
-sub dec_json_str
-{
-       my $s = shift;
-       my %rep = (
-               '"' => '"',
-               '/' => '/',
-               'b' => "\x08",
-               'f' => "\x0c",
-               'n' => "\n",
-               'r' => "\r",
-               't' => "\t",
-               '\\' => '\\'
-       );
-
-       $s =~ s!\\([\\/"bfnrt]|u([0-9a-fA-F]{4}))!
-               $2 ? chr(hex($2)) : $rep{$1}
-       !egx;
-
-       $s =~ s/[\s\n]+/ /g;
-       $s =~ s/^ //;
-       $s =~ s/ $//;
+       push @cmd, map { "--keyword=$_" } (@{$keywords{$ext}}, @keywords);
+       push @cmd, '-o', '-';
 
-       return $s;
+       return @cmd;
 }
 
-sub dec_tpl_str
-{
+sub whitespace_collapse($) {
        my $s = shift;
-       $s =~ s/-$//;
-       $s =~ s/[\s\n]+/ /g;
+       my %r = ('n' => ' ', 't' => ' ');
+
+       # Translate \t and \n to plain spaces, leave all other escape
+       # sequences alone. Finally replace all consecutive spaces by
+       # single ones and trim leading and trailing space.
+       $s =~ s/\\(.)/$r{$1} || "\\$1"/eg;
+       $s =~ s/ {2,}/ /g;
        $s =~ s/^ //;
        $s =~ s/ $//;
-       $s =~ s/\\/\\\\/g;
+
        return $s;
 }
 
-if( open F, "find @ARGV -type f '(' -name '*.htm' -o -name '*.lua' -o -name '*.js' ')' | sort |" )
-{
-       while( defined( my $file = readline F ) )
-       {
-               chomp $file;
-
-               if( open S, "< $file" )
-               {
-                       binmode S, ':utf8';
+sub postprocess_pot($$) {
+       my ($path, $source) = @_;
+       my (@res, $msgid);
+       my $skip = 1;
 
-                       local $/ = undef;
-                       my $raw = <S>;
-                       close S;
+       $source =~ s/^#: (.+?)\n/join("\n", map { "#: $path:$_" } $1 =~ m!:(\d+)!g) . "\n"/emg;
 
-                       my $text = $raw;
-                       my $line = 1;
+       my @lines = split /\n/, $source;
 
-                       while ($text =~ s/ ^ (.*?) (?:translate|translatef|i18n|_) ([\n\s]*) \( //sgx)
-                       {
-                               my ($prefix, $suffix) = ($1, $2);
-                               my $code;
-                               my $res = "";
-                               my $sub = "";
-
-                               $line += () = $prefix =~ /\n/g;
-
-                               my $position = "$file:$line";
+       # Remove all header lines up to the first location comment
+       while (@lines > 0 && $lines[0] !~ m!^#: !) {
+               shift @lines;
+       }
 
-                               $line += () = $suffix =~ /\n/g;
+       while (@lines > 0) {
+               my $line = shift @lines;
 
-                               while (defined $sub)
-                               {
-                                       undef $sub;
+               # Concat multiline msgids and collapse whitespaces
+               if ($line =~ m!^(msg\w+) "(.*)"$!) {
+                       my $kw = $1;
+                       my $kv = $2;
 
-                                       if ($text =~ /^ ([\n\s]*(?:\.\.[\n\s]*)?) (\[=*\[) /sx)
-                                       {
-                                               my $ws = $1;
-                                               my $stag = quotemeta $2;
-                                               (my $etag = $stag) =~ y/[/]/;
+                       while (@lines > 0 && $lines[0] =~ m!^"(.*)"$!) {
+                               $kv .= ' '. $1;
+                               shift @lines;
+                       }
 
-                                               ($sub, $text) = extract_tagged($text, $stag, $etag, q{\s*(?:\.\.\s*)?});
+                       $kv = whitespace_collapse($kv);
 
-                                               $line += () = $ws =~ /\n/g;
+                       # Filter invalid empty msgids by popping all lines in @res
+                       # leading to this point and skip all subsequent lines in
+                       # @lines belonging to this faulty id.
+                       if ($kw ne 'msgstr' && $kv eq '') {
+                               while (@res > 0 && $res[-1] !~ m!^$!) {
+                                       pop @res;
+                               }
 
-                                               if (defined($sub) && length($sub)) {
-                                                       $line += () = $sub =~ /\n/g;
+                               while (@lines > 0 && $lines[0] =~ m!^(?:msg\w+ )?"(.*)"$!) {
+                                       shift @lines;
+                               }
 
-                                                       $sub =~ s/^$stag//;
-                                                       $sub =~ s/$etag$//;
-                                                       $res .= $sub;
-                                               }
-                                       }
-                                       elsif ($text =~ /^ ([\n\s]*(?:\.\.[\n\s]*)?) (['"]) /sx)
-                                       {
-                                               my $ws = $1;
-                                               my $quote = $2;
-                                               my $re = gen_delimited_pat($quote, '\\');
+                               next;
+                       }
 
-                                               if ($text =~ m/\G\s*(?:\.\.\s*)?($re)/gcs)
-                                               {
-                                                       $sub = $1;
-                                                       $text = substr $text, pos $text;
-                                               }
+                       push @res, sprintf '%s "%s"', $kw, $kv;
+               }
 
-                                               $line += () = $ws =~ /\n/g;
+               # Ignore any flags added by xgettext
+               elsif ($line =~ m!^#, !) {
+                       next;
+               }
 
-                                               if (defined($sub) && length($sub)) {
-                                                       $line += () = $sub =~ /\n/g;
+               # Pass through other lines unmodified
+               else {
+                       push @res, $line;
+               }
+       }
 
-                                                       $sub =~ s/^$quote//;
-                                                       $sub =~ s/$quote$//;
-                                                       $res .= $sub;
-                                               }
-                                       }
-                               }
+       return @res ? join("\n", '', @res, '') : '';
+}
 
-                               if (defined($res))
-                               {
-                                       $res = dec_lua_str($res);
+sub uniq(@) {
+       my %h = map { $_, 1 } @_;
+       return sort keys %h;
+}
 
-                                       if ($res) {
-                                               $stringtable{$res} ||= [ ];
-                                               push @{$stringtable{$res}}, $position;
-                                       }
-                               }
-                       }
+sub preprocess_htm($$) {
+       my ($path, $source) = @_;
+       my $sub = {
+               '=' => '(%s)',
+               '_' => 'translate([==[%s]==])',
+               ':' => 'translate([==[%s]==])',
+               '+' => 'include([==[%s]==)',
+               '#' => '--[==[%s]==]',
+               ''  => '%s'
+       };
+
+       # Translate the .htm source into a valid Lua source using bracket quotes
+       # to avoid the need for complex escaping.
+       $source =~ s|<%-?([=_:+#]?)(.*?)-?%>|sprintf "]==]; $sub->{$1}; [==[", $2|sge;
+
+       # Discover expressions like "lng.translate(...)" or "luci.i18n.translate(...)"
+       # and return them as extra keyword so that xgettext recognizes such expressions
+       # as translate(...) calls.
+       my @extra_function_keywords =
+               map { ("$_:1", "$_:1,2c") }
+               uniq($source =~ m!((?:\w+\.)+translatef?)[ \t\n]*\(!g);
+
+       return ("[==[$source]==]", @extra_function_keywords);
+}
 
+sub preprocess_lua($$) {
+       my ($path, $source) = @_;
 
-                       $text = $raw;
-                       $line = 1;
+       # Discover expressions like "lng.translate(...)" or "luci.i18n.translate(...)"
+       # and return them as extra keyword so that xgettext recognizes such expressions
+       # as translate(...) calls.
+       my @extra_function_keywords =
+               map { ("$_:1", "$_:1,2c") }
+               uniq($source =~ m!((?:\w+\.)+translatef?)[ \t\n]*\(!g);
 
-                       while( $text =~ s/ ^ (.*?) <% -? [:_] /<%/sgx )
-                       {
-                               $line += () = $1 =~ /\n/g;
+       return ($source, @extra_function_keywords);
+}
 
-                               ( my $code, $text ) = extract_tagged($text, '<%', '%>');
+sub preprocess_json($$) {
+       my ($path, $source) = @_;
+       my ($file) = $path =~ m!([^/]+)$!;
 
-                               if( defined $code )
-                               {
-                                       my $position = "$file:$line";
+       $source =~ s/("(?:title)")\s*:\s*("(?:[^"\\]|\\.)*")/$1: _($2)/sg;
 
-                                       $line += () = $code =~ /\n/g;
+       return ($source);
+}
 
-                                       $code = dec_tpl_str(substr $code, 2, length($code) - 4);
 
-                                       $stringtable{$code} ||= [];
-                                       push @{$stringtable{$code}}, $position;
-                               }
-                       }
-               }
-       }
+my ($msguniq_in, $msguniq_out);
+my $msguniq_pid = open2($msguniq_out, $msguniq_in, 'msguniq', '-s');
 
-       close F;
-}
+print $msguniq_in "msgid \"\"\nmsgstr \"Content-Type: text/plain; charset=UTF-8\"\n";
 
-if( open F, "find @ARGV -type f -path '*/menu.d/*.json' | sort |" )
+if (open F, "find @ARGV -type f '(' -name '*.htm' -o -name '*.lua' -o -name '*.js' -o -path '*/menu.d/*.json' ')' |")
 {
-       while( defined( my $file = readline F ) )
+       while (defined( my $file = readline F))
        {
                chomp $file;
 
-               if( open S, "< $file" )
+               if (open S, '<', $file)
                {
-                       binmode S, ':utf8';
-
                        local $/ = undef;
-                       my $raw = <S>;
-                       close S;
+                       my $source = <S>;
+                       my @extra_function_keywords;
 
-                       my $text = $raw;
-                       my $line = 1;
-
-                       while ($text =~ s/ ^ (.*?) "title" ([\n\s]*) : //sgx)
+                       if ($file =~ m!\.htm$!)
                        {
-                               my ($prefix, $suffix) = ($1, $2);
-                               my $code;
-                               my $res = "";
-                               my $sub = "";
-
-                               $line += () = $prefix =~ /\n/g;
-
-                               my $position = "$file:$line";
-
-                               $line += () = $suffix =~ /\n/g;
-
-                               while (defined $sub)
-                               {
-                                       undef $sub;
-
-                                       if ($text =~ /^ ([\n\s]*) " /sx)
-                                       {
-                                               my $ws = $1;
-                                               my $re = gen_delimited_pat('"', '\\');
-
-                                               if ($text =~ m/\G\s*($re)/gcs)
-                                               {
-                                                       $sub = $1;
-                                                       $text = substr $text, pos $text;
-                                               }
+                               ($source, @extra_function_keywords) = preprocess_htm($file, $source);
+                       }
+                       elsif ($file =~ m!\.lua$!)
+                       {
+                               ($source, @extra_function_keywords) = preprocess_lua($file, $source);
+                       }
+                       elsif ($file =~ m!\.json$!)
+                       {
+                               ($source, @extra_function_keywords) = preprocess_json($file, $source);
+                       }
 
-                                               $line += () = $ws =~ /\n/g;
+                       my ($xgettext_in, $xgettext_out);
+                       my $pid = open2($xgettext_out, $xgettext_in, xgettext($file, @extra_function_keywords), '-');
 
-                                               if (defined($sub) && length($sub)) {
-                                                       $line += () = $sub =~ /\n/g;
+                       print $xgettext_in $source;
+                       close $xgettext_in;
 
-                                                       $sub =~ s/^"//;
-                                                       $sub =~ s/"$//;
-                                                       $res .= $sub;
-                                               }
-                                       }
-                               }
+                       my $pot = readline $xgettext_out;
+                       close $xgettext_out;
 
-                               if (defined($res))
-                               {
-                                       $res = dec_json_str($res);
+                       waitpid $pid, 0;
 
-                                       if ($res) {
-                                               $stringtable{$res} ||= [ ];
-                                               push @{$stringtable{$res}}, $position;
-                                       }
-                               }
-                       }
+                       print $msguniq_in postprocess_pot($file, $pot);
                }
        }
 
        close F;
 }
 
+close $msguniq_in;
 
-if( open C, "| msgcat -" )
-{
-       binmode C, ':utf8';
+my @pot = <$msguniq_out>;
 
-       printf C "msgid \"\"\nmsgstr \"Content-Type: text/plain; charset=UTF-8\"\n\n";
+close $msguniq_out;
+waitpid $msguniq_pid, 0;
 
-       foreach my $key ( sort keys %stringtable )
-       {
-               if( length $key )
-               {
-                       my @positions =
-                               map { join ':', @$_ }
-                               sort { ($a->[0] cmp $b->[0]) || ($a->[1] <=> $b->[1]) }
-                               map { [ /^(.+):(\d+)$/ ] }
-                               @{$stringtable{$key}};
-
-                       $key =~ s/\\/\\\\/g;
-                       $key =~ s/\n/\\n/g;
-                       $key =~ s/\t/\\t/g;
-                       $key =~ s/"/\\"/g;
-
-                       printf C "#: %s\nmsgid \"%s\"\nmsgstr \"\"\n\n",
-                               join(' ', @positions), $key;
+while (@pot > 0) {
+       my $line = shift @pot;
+
+       # Reorder the location comments in a detemrinistic way to
+       # reduce SCM noise when frequently updating templates.
+       if ($line =~ m!^#: !) {
+               my @locs = ($line);
+
+               while (@pot > 0 && $pot[0] =~ m!^#: !) {
+                       push @locs, shift @pot;
                }
+
+               print
+                       map { join(':', @$_) . "\n" }
+                       sort { ($a->[0] cmp $b->[0]) || ($a->[1] <=> $b->[1]) }
+                       map { [ /^(.+):(\d+)$/ ] }
+                       @locs
+               ;
+
+               next;
        }
 
-       close C;
+       print $line;
 }