Ver código fonte

Support multi-byte --transform='...\L...' etc.

Support upcasing and downcasing in multi-byte locales.
* gnulib.modules: Add c32rtomb, c32tolower, c32toupper,
mbrtoc32-regular.
* src/transform.c: Do not include ctype.h.  Include mcel.h.
(stk, stk_init): Move up.
(run_case_conv): Return void, not char *.  Append result to
stk directly; this avoids the need for a separate allocation.
All callers changed.  Do not assume a single-byte locale.
* tests/xform04.at: New test.
* tests/Makefile.am (TESTSUITE_AT):
* tests/testsuite.at: Add it.
Paul Eggert 1 ano atrás
pai
commit
c1e277476c
6 arquivos alterados com 97 adições e 59 exclusões
  1. 4 1
      NEWS
  2. 4 0
      gnulib.modules
  3. 38 57
      src/transform.c
  4. 2 1
      tests/Makefile.am
  5. 1 0
      tests/testsuite.at
  6. 48 0
      tests/xform04.at

+ 4 - 1
NEWS

@@ -1,4 +1,4 @@
-GNU tar NEWS - User visible changes. 2023-09-10
+GNU tar NEWS - User visible changes. 2023-09-12
 Please send GNU tar bug reports to <bug-tar@gnu.org>
 
 version TBD
@@ -33,6 +33,9 @@ used, command output will be parsed using strptime(3).
 
 ** When diagnosing invalid extended headers tar now quotes control characters.
 
+** Transformations that change case (e.g., --transform='s/.*/\L&/')
+   now work correctly with multi-byte characters.
+
 
 version 1.35 - Sergey Poznyakoff, 2023-07-18
 

+ 4 - 0
gnulib.modules

@@ -25,6 +25,9 @@ argp-version-etc
 attribute
 backupfile
 c-ctype
+c32rtomb
+c32tolower
+c32toupper
 closeout
 configmake
 dirname
@@ -64,6 +67,7 @@ lchown
 linkat
 localcharset
 manywarnings
+mbrtoc32-regular
 mcel-prefer
 mkdirat
 mkdtemp

+ 38 - 57
src/transform.c

@@ -15,8 +15,8 @@
    with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #include <system.h>
-#include <ctype.h>
 #include <regex.h>
+#include <mcel.h>
 #include "common.h"
 
 enum transform_type
@@ -417,51 +417,44 @@ set_transform_expr (const char *expr)
     expr = parse_transform_expr (expr);
 }
 
+
+static struct obstack stk;
+static bool stk_init;
+
 /* Run case conversion specified by CASE_CTL on array PTR of SIZE
-   characters. Returns pointer to statically allocated storage. */
-static char *
+   characters.  Append the result to STK.  */
+static void
 run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
 {
-  static char *case_ctl_buffer;
-  static size_t case_ctl_bufsize;
-  char *p;
-
-  if (case_ctl_bufsize < size)
+  char const *p = ptr, *plim = ptr + size;
+  mbstate_t mbs; mbszero (&mbs);
+  while (p < plim)
     {
-      case_ctl_bufsize = size;
-      case_ctl_buffer = xrealloc (case_ctl_buffer, case_ctl_bufsize);
+      mcel_t g = mcel_scan (p, plim);
+      char32_t ch;
+      switch (case_ctl)
+	{
+	case ctl_upcase: case ctl_upcase_next: ch = c32toupper (g.ch); break;
+	case ctl_locase: case ctl_locase_next: ch = c32tolower (g.ch); break;
+	default: ch = g.ch; break;
+	}
+      if (ch == g.ch)
+	obstack_grow (&stk, p, g.len);
+      else
+	{
+	  obstack_make_room (&stk, MB_LEN_MAX);
+	  mbstate_t ombs; mbszero (&ombs);
+	  size_t outbytes = c32rtomb (obstack_next_free (&stk), ch, &ombs);
+	  obstack_blank_fast (&stk, outbytes);
+	}
+      p += g.len;
+      if (case_ctl != ctl_upcase && case_ctl != ctl_locase)
+	break;
     }
-  memcpy (case_ctl_buffer, ptr, size);
-  switch (case_ctl)
-    {
-    case ctl_upcase_next:
-      case_ctl_buffer[0] = toupper ((unsigned char) case_ctl_buffer[0]);
-      break;
-
-    case ctl_locase_next:
-      case_ctl_buffer[0] = tolower ((unsigned char) case_ctl_buffer[0]);
-      break;
 
-    case ctl_upcase:
-      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
-	*p = toupper ((unsigned char) *p);
-      break;
-
-    case ctl_locase:
-      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
-	*p = tolower ((unsigned char) *p);
-      break;
-
-    case ctl_stop:
-      break;
-    }
-  return case_ctl_buffer;
+  obstack_grow (&stk, p, plim - p);
 }
 
-
-static struct obstack stk;
-static bool stk_init;
-
 static void
 _single_transform_name_to_obstack (struct transform *tf, char *input)
 {
@@ -484,7 +477,6 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
   while (*input)
     {
       size_t disp;
-      char *ptr;
 
       rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0);
 
@@ -510,16 +502,10 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
 	      switch (segm->type)
 		{
 		case segm_literal:    /* Literal segment */
-		  if (case_ctl == ctl_stop)
-		    ptr = segm->v.literal.ptr;
-		  else
-		    {
-		      ptr = run_case_conv (case_ctl,
-					   segm->v.literal.ptr,
-					   segm->v.literal.size);
-		      CASE_CTL_RESET();
-		    }
-		  obstack_grow (&stk, ptr, segm->v.literal.size);
+		  run_case_conv (case_ctl,
+				 segm->v.literal.ptr,
+				 segm->v.literal.size);
+		  CASE_CTL_RESET ();
 		  break;
 
 		case segm_backref:    /* Back-reference segment */
@@ -528,14 +514,9 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
 		    {
 		      size_t size = rmp[segm->v.ref].rm_eo
 			              - rmp[segm->v.ref].rm_so;
-		      ptr = input + rmp[segm->v.ref].rm_so;
-		      if (case_ctl != ctl_stop)
-			{
-			  ptr = run_case_conv (case_ctl, ptr, size);
-			  CASE_CTL_RESET();
-			}
-
-		      obstack_grow (&stk, ptr, size);
+		      run_case_conv (case_ctl,
+				     input + rmp[segm->v.ref].rm_so, size);
+		      CASE_CTL_RESET ();
 		    }
 		  break;
 

+ 2 - 1
tests/Makefile.am

@@ -287,7 +287,8 @@ TESTSUITE_AT = \
  xform-h.at\
  xform01.at\
  xform02.at\
- xform03.at
+ xform03.at\
+ xform04.at
 
 distclean-local:
 	-rm -rf download

+ 1 - 0
tests/testsuite.at

@@ -293,6 +293,7 @@ m4_include([xform-h.at])
 m4_include([xform01.at])
 m4_include([xform02.at])
 m4_include([xform03.at])
+m4_include([xform04.at])
 
 AT_BANNER([Exclude])
 m4_include([exclude.at])

+ 48 - 0
tests/xform04.at

@@ -0,0 +1,48 @@
+# Process this file with autom4te to create testsuite. -*- Autotest -*-
+
+# Test suite for GNU tar.
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This file is part of GNU tar.
+
+# GNU tar is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# GNU tar is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Changing case can change the byte count: in UTF-8, Ⱥ is 2 bytes but ⱥ is 3.
+
+AT_SETUP([transformations and multi-byte downcasing])
+AT_KEYWORDS([transform xform xform04])
+
+AT_TAR_CHECK([
+if test "`(locale charmap) 2>/dev/null`" != UTF-8; then
+  for locale in en_US.UTF-8 `(locale -a) 2>/dev/null` not-found; do
+    case $locale in
+      *.[[Uu][Tt][Ff]]*8)
+	if test "`(LC_ALL=$locale locale charmap) 2>/dev/null`" = UTF-8; then
+	  LC_ALL=$locale
+	  export LC_ALL
+	  break
+	fi;;
+      not-found)
+	AT_SKIP_TEST;;
+    esac
+  done
+fi
+
+genfile --file Aa.Ⱥⱥ
+tar -cvf /dev/null --transform='s/.*/\L&-\U&/' --show-transformed-name Aa.Ⱥⱥ],
+[0],
+[aa.ⱥⱥ-AA.ȺȺ
+])
+
+AT_CLEANUP