bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Bug-wget] bad filenames (again)


From: Andries E. Brouwer
Subject: Re: [Bug-wget] bad filenames (again)
Date: Sun, 9 Aug 2015 22:08:34 +0200
User-agent: Mutt/1.5.21 (2010-09-15)

On Fri, Aug 07, 2015 at 05:13:19PM +0200, Tim Ruehsen wrote:

> The solution would be something like
> 
> if locale is UTF-8
>   do not escape valid UTF-8 sequences
> else
>   keep wget's current behavior

> If you provide a patch for this, we would appreciate it.

OK - a first version of such a patch.
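This splits the control-character restriction (restrict_files_ctrl) into
two halves: low control bytes (0-31 and DEL) and high control bytes (128-159).
The low control restriction behaves as before.
High control bytes are permitted by default on a Unix system
whose environment looks like a UTF-8 locale.
For Windows the behavior is unchanged.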

Andries

Test: fetch http://he.wikipedia.org/wiki/הרפש_.ש


diff -ru wget-1.16.3/src/init.c wget-1.16.3a/src/init.c
--- wget-1.16.3/src/init.c      2015-01-31 00:25:57.000000000 +0100
+++ wget-1.16.3a/src/init.c     2015-08-09 21:44:54.260215105 +0200
@@ -333,6 +333,27 @@
   return -1;
 }
 
+
+/* Used to determine whether bytes 128-159 are OK in a filename: true if the
+   first non-empty variable among LC_ALL, LC_CTYPE, LANG contains "UTF-8",
+   "UTF8", "utf-8" or "utf8".  Always false on Windows (no test written yet). */
+static int
+have_utf8_locale(void) {
+#if !(defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__))
+  const char *p = getenv("LC_ALL");
+  if (p == NULL || *p == '\0')     /* POSIX treats an empty value as unset */
+    p = getenv("LC_CTYPE");
+  if (p == NULL || *p == '\0')
+    p = getenv("LANG");
+  /* All three may be unset, and strstr(NULL, ...) is undefined behavior.  */
+  if (p != NULL &&
+      (strstr(p, "UTF-8") != NULL || strstr(p, "UTF8") != NULL ||
+       strstr(p, "utf-8") != NULL || strstr(p, "utf8") != NULL))
+    return true;
+#endif
+  return false;
+}
+
 /* Reset the variables to default values.  */
 void
 defaults (void)
@@ -401,6 +422,7 @@
   opt.restrict_files_os = restrict_unix;
 #endif
   opt.restrict_files_ctrl = true;
+  opt.restrict_files_highctrl = (have_utf8_locale() ? false : true);
   opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;
 
@@ -1466,6 +1488,7 @@
 {
   int restrict_os = opt.restrict_files_os;
   int restrict_ctrl = opt.restrict_files_ctrl;
+  int restrict_highctrl = opt.restrict_files_highctrl;
   int restrict_case = opt.restrict_files_case;
   int restrict_nonascii = opt.restrict_files_nonascii;
 
@@ -1488,7 +1511,7 @@
       else if (VAL_IS ("uppercase"))
         restrict_case = restrict_uppercase;
       else if (VAL_IS ("nocontrol"))
-        restrict_ctrl = false;
+        restrict_ctrl = restrict_highctrl = false;
       else if (VAL_IS ("ascii"))
         restrict_nonascii = true;
       else
@@ -1509,6 +1532,7 @@
 
   opt.restrict_files_os = restrict_os;
   opt.restrict_files_ctrl = restrict_ctrl;
+  opt.restrict_files_highctrl = restrict_highctrl;
   opt.restrict_files_case = restrict_case;
   opt.restrict_files_nonascii = restrict_nonascii;
 
diff -ru wget-1.16.3/src/options.h wget-1.16.3a/src/options.h
--- wget-1.16.3/src/options.h   2015-01-31 00:25:57.000000000 +0100
+++ wget-1.16.3a/src/options.h  2015-08-09 21:22:35.984186065 +0200
@@ -244,6 +244,7 @@
   bool restrict_files_ctrl;     /* non-zero if control chars in URLs
                                    are restricted from appearing in
                                    generated file names. */
+  bool restrict_files_highctrl; /* idem for bytes 128-159 */
   bool restrict_files_nonascii; /* non-zero if bytes with values greater
                                    than 127 are restricted. */
   enum {
diff -ru wget-1.16.3/src/url.c wget-1.16.3a/src/url.c
--- wget-1.16.3/src/url.c       2015-02-23 16:10:22.000000000 +0100
+++ wget-1.16.3a/src/url.c      2015-08-09 21:14:34.876175626 +0200
@@ -1329,7 +1329,8 @@
 enum {
   filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
   filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
-  filechr_control     = 4       /* a control character, e.g. 0-31 */
+  filechr_control     = 4,      /* a control character, e.g. 0-31 */
+  filechr_highcontrol = 8      /* a high control character, in 128-159 */
 };
 
 #define FILE_CHAR_TEST(c, mask) \
@@ -1340,6 +1341,7 @@
 #define U filechr_not_unix
 #define W filechr_not_windows
 #define C filechr_control
+#define Z filechr_highcontrol
 
 #define UW U|W
 #define UWC U|W|C
@@ -1370,8 +1372,8 @@
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
 
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 128-143 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 144-159 */
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
 
@@ -1383,6 +1385,7 @@
 #undef U
 #undef W
 #undef C
+#undef Z
 #undef UW
 #undef UWC
 
@@ -1417,8 +1420,11 @@
     mask = filechr_not_unix;
   else
     mask = filechr_not_windows;
+
   if (opt.restrict_files_ctrl)
     mask |= filechr_control;
+  if (opt.restrict_files_highctrl)
+    mask |= filechr_highcontrol;
 
   /* Copy [b, e) to PATHEL and URL-unescape it. */
   if (escaped)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]