Browse Source

Avoid quadratic behavior with delayed links

Do this by searching a hash table instead of a linked list.
Problem reported by Martin Dørum in https://mort.coffee/home/tar/
via Gavin Smith in:
https://lists.gnu.org/r/bug-tar/2022-07/msg00003.html
* src/extract.c: Include hash.h.
Improve performance a bit on non-birthtime hosts
(struct delayed_link.has_predecessor): New member.
(delayed_link_head): Remove, replacing with ...
(delayed_link_table): ... this new variable.  All uses
of linked list replaced with hash table.
(dl_hash, dl_compare): New functions for hash table.
(create_placeholder_file): Initialize has_predecessor.
(apply_delayed_link): New function, with body taken from
most of the old apply_delayed_link.
(apply_delayed_links): Use it.  Respect has_predecessor.
Don’t bother freeing as we are about to exit.
Paul Eggert 2 years ago
parent
commit
258d1c44e5
1 changed files with 139 additions and 94 deletions
  1. 139 94
      src/extract.c

+ 139 - 94
src/extract.c

@@ -22,6 +22,7 @@
 #include <system.h>
 #include <system.h>
 #include <quotearg.h>
 #include <quotearg.h>
 #include <errno.h>
 #include <errno.h>
+#include <hash.h>
 #include <priv-set.h>
 #include <priv-set.h>
 #include <root-uid.h>
 #include <root-uid.h>
 #include <utimens.h>
 #include <utimens.h>
@@ -128,12 +129,16 @@ struct delayed_set_stat
 
 
 static struct delayed_set_stat *delayed_set_stat_head;
 static struct delayed_set_stat *delayed_set_stat_head;
 
 
-/* List of links whose creation we have delayed.  */
+/* A link whose creation we have delayed.  */
 struct delayed_link
 struct delayed_link
   {
   {
-    /* The next delayed link in the list.  */
+    /* The next in a list of delayed links that should be made after
+       this delayed link.  */
     struct delayed_link *next;
     struct delayed_link *next;
 
 
+    /* Whether this delayed link has a predecessor in the NEXT list.  */
+    bool has_predecessor;
+
     /* The device, inode number and birthtime of the placeholder.
     /* The device, inode number and birthtime of the placeholder.
        birthtime.tv_nsec is negative if the birthtime is not available.
        birthtime.tv_nsec is negative if the birthtime is not available.
        Don't use mtime as this would allow for false matches if some
        Don't use mtime as this would allow for false matches if some
@@ -178,7 +183,7 @@ struct delayed_link
     char target[1];
     char target[1];
   };
   };
 
 
-static struct delayed_link *delayed_link_head;
+static Hash_table *delayed_link_table;
 
 
 struct string_list
 struct string_list
   {
   {
@@ -186,6 +191,25 @@ struct string_list
     char string[1];
     char string[1];
   };
   };
 
 
+static size_t
+dl_hash (void const *entry, size_t table_size)
+{
+  struct delayed_link const *dl = entry;
+  uintmax_t n = dl->dev;
+  int nshift = TYPE_WIDTH (n) - TYPE_WIDTH (dl->dev);
+  if (0 < nshift)
+    n <<= nshift;
+  n ^= dl->ino;
+  return n % table_size;
+}
+
+static bool
+dl_compare (void const *a, void const *b)
+{
+  struct delayed_link const *da = a, *db = b;
+  return (da->dev == db->dev) & (da->ino == db->ino);
+}
+
 /*  Set up to extract files.  */
 /*  Set up to extract files.  */
 void
 void
 extr_init (void)
 extr_init (void)
@@ -1349,23 +1373,21 @@ extract_file (char *file_name, int typeflag)
 }
 }
 
 
 /* Find a delayed_link structure corresponding to the source NAME.
 /* Find a delayed_link structure corresponding to the source NAME.
-   Such a structure exists in the delayed link list only if the link
+   Such a structure exists in the delayed link table only if the link
    placeholder file has been created. Therefore, try to stat the NAME
    placeholder file has been created. Therefore, try to stat the NAME
-   first. If it doesn't exist, there is no matching entry in the list.
-   Otherwise, look for the entry in list which has the matching dev
-   and ino numbers.
+   first. If it doesn't exist, there is no matching entry in the table.
+   Otherwise, look for the entry in the table that has the matching dev
+   and ino numbers.  Return a null pointer if not found.
 
 
-   This approach avoids scanning the singly-linked list in obvious cases
-   and does not rely on comparing file names, which may differ for
+   Do not rely on comparing file names, which may differ for
    various reasons (e.g. relative vs. absolute file names).
    various reasons (e.g. relative vs. absolute file names).
  */
  */
 static struct delayed_link *
 static struct delayed_link *
 find_delayed_link_source (char const *name)
 find_delayed_link_source (char const *name)
 {
 {
-  struct delayed_link *dl;
   struct stat st;
   struct stat st;
 
 
-  if (!delayed_link_head)
+  if (!delayed_link_table)
     return NULL;
     return NULL;
 
 
   if (fstatat (chdir_fd, name, &st, AT_SYMLINK_NOFOLLOW))
   if (fstatat (chdir_fd, name, &st, AT_SYMLINK_NOFOLLOW))
@@ -1375,12 +1397,10 @@ find_delayed_link_source (char const *name)
       return NULL;
       return NULL;
     }
     }
 
 
-  for (dl = delayed_link_head; dl; dl = dl->next)
-    {
-      if (dl->dev == st.st_dev && dl->ino == st.st_ino)
-	break;
-    }
-  return dl;
+  struct delayed_link dl;
+  dl.dev = st.st_dev;
+  dl.ino = st.st_ino;
+  return hash_lookup (delayed_link_table, &dl);
 }
 }
 
 
 /* Create a placeholder file with name FILE_NAME, which will be
 /* Create a placeholder file with name FILE_NAME, which will be
@@ -1388,9 +1408,7 @@ find_delayed_link_source (char const *name)
    IS_SYMLINK is true, and by a hard link otherwise.  Set
    IS_SYMLINK is true, and by a hard link otherwise.  Set
    *INTERDIR_MADE if an intermediate directory is made in the
    *INTERDIR_MADE if an intermediate directory is made in the
    process.
    process.
-   Install the created struct delayed_link after PREV, unless the
-   latter is NULL, in which case insert it at the head of the delayed
-   link list.
+   If PREV, install the created struct delayed_link after PREV.
 */
 */
 
 
 static int
 static int
@@ -1443,12 +1461,14 @@ create_placeholder_file (char *file_name, bool is_symlink, bool *interdir_made,
 	{
 	{
 	  p->next = prev->next;
 	  p->next = prev->next;
 	  prev->next = p;
 	  prev->next = p;
+	  p->has_predecessor = true;
 	}
 	}
       else
       else
 	{
 	{
-	  p->next = delayed_link_head;
-	  delayed_link_head = p;
+	  p->next = NULL;
+	  p->has_predecessor = false;
 	}
 	}
+
       p->dev = st.st_dev;
       p->dev = st.st_dev;
       p->ino = st.st_ino;
       p->ino = st.st_ino;
 #if HAVE_BIRTHTIME
 #if HAVE_BIRTHTIME
@@ -1478,6 +1498,12 @@ create_placeholder_file (char *file_name, bool is_symlink, bool *interdir_made,
       xattr_map_copy (&p->xattr_map, &current_stat_info.xattr_map);
       xattr_map_copy (&p->xattr_map, &current_stat_info.xattr_map);
       strcpy (p->target, current_stat_info.link_name);
       strcpy (p->target, current_stat_info.link_name);
 
 
+      if (! ((delayed_link_table
+	      || (delayed_link_table = hash_initialize (0, 0, dl_hash,
+							dl_compare, free)))
+	     && hash_insert (delayed_link_table, p)))
+	xalloc_die ();
+
       if ((h = find_direct_ancestor (file_name)) != NULL)
       if ((h = find_direct_ancestor (file_name)) != NULL)
 	mark_after_links (h);
 	mark_after_links (h);
 
 
@@ -1512,13 +1538,14 @@ extract_link (char *file_name, int typeflag)
 
 
       if (status == 0)
       if (status == 0)
 	{
 	{
-	  struct delayed_link *ds = delayed_link_head;
-	  if (ds
+	  if (delayed_link_table
 	      && fstatat (chdir_fd, link_name, &st1, AT_SYMLINK_NOFOLLOW) == 0)
 	      && fstatat (chdir_fd, link_name, &st1, AT_SYMLINK_NOFOLLOW) == 0)
-	    for (; ds; ds = ds->next)
-	      if (ds->change_dir == chdir_current
-		  && ds->dev == st1.st_dev
-		  && ds->ino == st1.st_ino
+	    {
+	      struct delayed_link dl1;
+	      dl1.ino = st1.st_ino;
+	      dl1.dev = st1.st_dev;
+	      struct delayed_link *ds = hash_lookup (delayed_link_table, &dl1);
+	      if (ds && ds->change_dir == chdir_current
 		  && BIRTHTIME_EQ (ds->birthtime, get_stat_birthtime (&st1)))
 		  && BIRTHTIME_EQ (ds->birthtime, get_stat_birthtime (&st1)))
 		{
 		{
 		  struct string_list *p =  xmalloc (offsetof (struct string_list, string)
 		  struct string_list *p =  xmalloc (offsetof (struct string_list, string)
@@ -1526,8 +1553,9 @@ extract_link (char *file_name, int typeflag)
 		  strcpy (p->string, file_name);
 		  strcpy (p->string, file_name);
 		  p->next = ds->sources;
 		  p->next = ds->sources;
 		  ds->sources = p;
 		  ds->sources = p;
-		  break;
 		}
 		}
+	    }
+
 	  return 0;
 	  return 0;
 	}
 	}
       else if ((e == EEXIST && strcmp (link_name, file_name) == 0)
       else if ((e == EEXIST && strcmp (link_name, file_name) == 0)
@@ -1853,86 +1881,103 @@ extract_archive (void)
     undo_last_backup ();
     undo_last_backup ();
 }
 }
 
 
-/* Extract the links whose final extraction were delayed.  */
+/* Extract the link DS whose final extraction was delayed.  */
 static void
 static void
-apply_delayed_links (void)
+apply_delayed_link (struct delayed_link *ds)
 {
 {
-  struct delayed_link *ds;
+  struct string_list *sources = ds->sources;
+  char const *valid_source = 0;
 
 
-  for (ds = delayed_link_head; ds; )
-    {
-      struct string_list *sources = ds->sources;
-      char const *valid_source = 0;
+  chdir_do (ds->change_dir);
 
 
-      chdir_do (ds->change_dir);
+  for (sources = ds->sources; sources; sources = sources->next)
+    {
+      char const *source = sources->string;
+      struct stat st;
 
 
-      for (sources = ds->sources; sources; sources = sources->next)
+      /* Make sure the placeholder file is still there.  If not,
+	 don't create a link, as the placeholder was probably
+	 removed by a later extraction.  */
+      if (fstatat (chdir_fd, source, &st, AT_SYMLINK_NOFOLLOW) == 0
+	  && st.st_dev == ds->dev
+	  && st.st_ino == ds->ino
+	  && BIRTHTIME_EQ (get_stat_birthtime (&st), ds->birthtime))
 	{
 	{
-	  char const *source = sources->string;
-	  struct stat st;
-
-	  /* Make sure the placeholder file is still there.  If not,
-	     don't create a link, as the placeholder was probably
-	     removed by a later extraction.  */
-	  if (fstatat (chdir_fd, source, &st, AT_SYMLINK_NOFOLLOW) == 0
-	      && st.st_dev == ds->dev
-	      && st.st_ino == ds->ino
-	      && BIRTHTIME_EQ (get_stat_birthtime (&st), ds->birthtime))
+	  /* Unlink the placeholder, then create a hard link if possible,
+	     a symbolic link otherwise.  */
+	  if (unlinkat (chdir_fd, source, 0) != 0)
+	    unlink_error (source);
+	  else if (valid_source
+		   && (linkat (chdir_fd, valid_source, chdir_fd, source, 0)
+		       == 0))
+	    ;
+	  else if (!ds->is_symlink)
 	    {
 	    {
-	      /* Unlink the placeholder, then create a hard link if possible,
-		 a symbolic link otherwise.  */
-	      if (unlinkat (chdir_fd, source, 0) != 0)
-		unlink_error (source);
-	      else if (valid_source
-		       && (linkat (chdir_fd, valid_source, chdir_fd, source, 0)
-			   == 0))
-		;
-	      else if (!ds->is_symlink)
-		{
-		  if (linkat (chdir_fd, ds->target, chdir_fd, source, 0) != 0)
-		    link_error (ds->target, source);
-		}
-	      else if (symlinkat (ds->target, chdir_fd, source) != 0)
-		symlink_error (ds->target, source);
-	      else
-		{
-		  struct tar_stat_info st1;
-		  st1.stat.st_mode = ds->mode;
-		  st1.stat.st_uid = ds->uid;
-		  st1.stat.st_gid = ds->gid;
-		  st1.atime = ds->atime;
-		  st1.mtime = ds->mtime;
-                  st1.cntx_name = ds->cntx_name;
-                  st1.acls_a_ptr = ds->acls_a_ptr;
-                  st1.acls_a_len = ds->acls_a_len;
-                  st1.acls_d_ptr = ds->acls_d_ptr;
-                  st1.acls_d_len = ds->acls_d_len;
-                  st1.xattr_map = ds->xattr_map;
-		  set_stat (source, &st1, -1, 0, 0, SYMTYPE,
-			    false, AT_SYMLINK_NOFOLLOW);
-		  valid_source = source;
-		}
+	      if (linkat (chdir_fd, ds->target, chdir_fd, source, 0) != 0)
+		link_error (ds->target, source);
+	    }
+	  else if (symlinkat (ds->target, chdir_fd, source) != 0)
+	    symlink_error (ds->target, source);
+	  else
+	    {
+	      struct tar_stat_info st1;
+	      st1.stat.st_mode = ds->mode;
+	      st1.stat.st_uid = ds->uid;
+	      st1.stat.st_gid = ds->gid;
+	      st1.atime = ds->atime;
+	      st1.mtime = ds->mtime;
+	      st1.cntx_name = ds->cntx_name;
+	      st1.acls_a_ptr = ds->acls_a_ptr;
+	      st1.acls_a_len = ds->acls_a_len;
+	      st1.acls_d_ptr = ds->acls_d_ptr;
+	      st1.acls_d_len = ds->acls_d_len;
+	      st1.xattr_map = ds->xattr_map;
+	      set_stat (source, &st1, -1, 0, 0, SYMTYPE,
+			false, AT_SYMLINK_NOFOLLOW);
+	      valid_source = source;
 	    }
 	    }
 	}
 	}
+    }
 
 
-      for (sources = ds->sources; sources; )
-	{
-	  struct string_list *next = sources->next;
-	  free (sources);
-	  sources = next;
-	}
+  for (sources = ds->sources; sources; )
+    {
+      struct string_list *next = sources->next;
+      free (sources);
+      sources = next;
+    }
+
+  xattr_map_free (&ds->xattr_map);
+  free (ds->cntx_name);
+}
 
 
-      xattr_map_free (&ds->xattr_map);
-      free (ds->cntx_name);
+/* Extract the links whose final extraction were delayed.  */
+static void
+apply_delayed_links (void)
+{
+  if (!delayed_link_table)
+    return;
 
 
+  for (struct delayed_link *dl = hash_get_first (delayed_link_table);
+       dl;
+       dl = dl->next ? dl->next : hash_get_next (delayed_link_table, dl))
+    if (!dl->has_predecessor)
       {
       {
-	struct delayed_link *next = ds->next;
-	free (ds);
-	ds = next;
+	struct delayed_link *ds = dl;
+	do
+	  {
+	    apply_delayed_link (ds);
+	    ds = ds->next;
+	  }
+	while (ds);
       }
       }
-    }
 
 
-  delayed_link_head = 0;
+  if (false)
+    {
+      /* There is little point to freeing, as we are about to exit,
+	 and freeing is more likely to cause than cure trouble.  */
+      hash_free (delayed_link_table);
+      delayed_link_table = NULL;
+    }
 }
 }
 
 
 /* Finish the extraction of an archive.  */
 /* Finish the extraction of an archive.  */