Browse Source

Avoid quadratic behavior with delayed links

Do this by searching a hash table instead of a linked list.
Problem reported by Martin Dørum in https://mort.coffee/home/tar/
via Gavin Smith in:
https://lists.gnu.org/r/bug-tar/2022-07/msg00003.html
* src/extract.c: Include hash.h.
Improve performance a bit on non-birthtime hosts
(struct delayed_link.has_predecessor): New member.
(delayed_link_head): Remove, replacing with ...
(delayed_link_table): ... this new variable.  All uses
of linked list replaced with hash table.
(dl_hash, dl_compare): New functions for hash table.
(create_placeholder_file): Initialize has_predecessor.
(apply_delayed_link): New function, with body taken from
most of the old apply_delayed_link.
(apply_delayed_links): Use it.  Respect has_predecessor.
Don’t bother freeing as we are about to exit.
Paul Eggert 2 years ago
parent
commit
258d1c44e5
1 changed files with 139 additions and 94 deletions
  1. 139 94
      src/extract.c

+ 139 - 94
src/extract.c

@@ -22,6 +22,7 @@
 #include <system.h>
 #include <quotearg.h>
 #include <errno.h>
+#include <hash.h>
 #include <priv-set.h>
 #include <root-uid.h>
 #include <utimens.h>
@@ -128,12 +129,16 @@ struct delayed_set_stat
 
 static struct delayed_set_stat *delayed_set_stat_head;
 
-/* List of links whose creation we have delayed.  */
+/* A link whose creation we have delayed.  */
 struct delayed_link
   {
-    /* The next delayed link in the list.  */
+    /* The next in a list of delayed links that should be made after
+       this delayed link.  */
     struct delayed_link *next;
 
+    /* Whether this delayed link has a predecessor in the NEXT list.  */
+    bool has_predecessor;
+
     /* The device, inode number and birthtime of the placeholder.
        birthtime.tv_nsec is negative if the birthtime is not available.
        Don't use mtime as this would allow for false matches if some
@@ -178,7 +183,7 @@ struct delayed_link
     char target[1];
   };
 
-static struct delayed_link *delayed_link_head;
+static Hash_table *delayed_link_table;
 
 struct string_list
   {
@@ -186,6 +191,25 @@ struct string_list
     char string[1];
   };
 
+static size_t
+dl_hash (void const *entry, size_t table_size)
+{
+  struct delayed_link const *dl = entry;
+  uintmax_t n = dl->dev;
+  int nshift = TYPE_WIDTH (n) - TYPE_WIDTH (dl->dev);
+  if (0 < nshift)
+    n <<= nshift;
+  n ^= dl->ino;
+  return n % table_size;
+}
+
+static bool
+dl_compare (void const *a, void const *b)
+{
+  struct delayed_link const *da = a, *db = b;
+  return (da->dev == db->dev) & (da->ino == db->ino);
+}
+
 /*  Set up to extract files.  */
 void
 extr_init (void)
@@ -1349,23 +1373,21 @@ extract_file (char *file_name, int typeflag)
 }
 
 /* Find a delayed_link structure corresponding to the source NAME.
-   Such a structure exists in the delayed link list only if the link
+   Such a structure exists in the delayed link table only if the link
    placeholder file has been created. Therefore, try to stat the NAME
-   first. If it doesn't exist, there is no matching entry in the list.
-   Otherwise, look for the entry in list which has the matching dev
-   and ino numbers.
+   first. If it doesn't exist, there is no matching entry in the table.
+   Otherwise, look for the entry in the table that has the matching dev
+   and ino numbers.  Return a null pointer if not found.
 
-   This approach avoids scanning the singly-linked list in obvious cases
-   and does not rely on comparing file names, which may differ for
+   Do not rely on comparing file names, which may differ for
    various reasons (e.g. relative vs. absolute file names).
  */
 static struct delayed_link *
 find_delayed_link_source (char const *name)
 {
-  struct delayed_link *dl;
   struct stat st;
 
-  if (!delayed_link_head)
+  if (!delayed_link_table)
     return NULL;
 
   if (fstatat (chdir_fd, name, &st, AT_SYMLINK_NOFOLLOW))
@@ -1375,12 +1397,10 @@ find_delayed_link_source (char const *name)
       return NULL;
     }
 
-  for (dl = delayed_link_head; dl; dl = dl->next)
-    {
-      if (dl->dev == st.st_dev && dl->ino == st.st_ino)
-	break;
-    }
-  return dl;
+  struct delayed_link dl;
+  dl.dev = st.st_dev;
+  dl.ino = st.st_ino;
+  return hash_lookup (delayed_link_table, &dl);
 }
 
 /* Create a placeholder file with name FILE_NAME, which will be
@@ -1388,9 +1408,7 @@ find_delayed_link_source (char const *name)
    IS_SYMLINK is true, and by a hard link otherwise.  Set
    *INTERDIR_MADE if an intermediate directory is made in the
    process.
-   Install the created struct delayed_link after PREV, unless the
-   latter is NULL, in which case insert it at the head of the delayed
-   link list.
+   If PREV, install the created struct delayed_link after PREV.
 */
 
 static int
@@ -1443,12 +1461,14 @@ create_placeholder_file (char *file_name, bool is_symlink, bool *interdir_made,
 	{
 	  p->next = prev->next;
 	  prev->next = p;
+	  p->has_predecessor = true;
 	}
       else
 	{
-	  p->next = delayed_link_head;
-	  delayed_link_head = p;
+	  p->next = NULL;
+	  p->has_predecessor = false;
 	}
+
       p->dev = st.st_dev;
       p->ino = st.st_ino;
 #if HAVE_BIRTHTIME
@@ -1478,6 +1498,12 @@ create_placeholder_file (char *file_name, bool is_symlink, bool *interdir_made,
       xattr_map_copy (&p->xattr_map, &current_stat_info.xattr_map);
       strcpy (p->target, current_stat_info.link_name);
 
+      if (! ((delayed_link_table
+	      || (delayed_link_table = hash_initialize (0, 0, dl_hash,
+							dl_compare, free)))
+	     && hash_insert (delayed_link_table, p)))
+	xalloc_die ();
+
       if ((h = find_direct_ancestor (file_name)) != NULL)
 	mark_after_links (h);
 
@@ -1512,13 +1538,14 @@ extract_link (char *file_name, int typeflag)
 
       if (status == 0)
 	{
-	  struct delayed_link *ds = delayed_link_head;
-	  if (ds
+	  if (delayed_link_table
 	      && fstatat (chdir_fd, link_name, &st1, AT_SYMLINK_NOFOLLOW) == 0)
-	    for (; ds; ds = ds->next)
-	      if (ds->change_dir == chdir_current
-		  && ds->dev == st1.st_dev
-		  && ds->ino == st1.st_ino
+	    {
+	      struct delayed_link dl1;
+	      dl1.ino = st1.st_ino;
+	      dl1.dev = st1.st_dev;
+	      struct delayed_link *ds = hash_lookup (delayed_link_table, &dl1);
+	      if (ds && ds->change_dir == chdir_current
 		  && BIRTHTIME_EQ (ds->birthtime, get_stat_birthtime (&st1)))
 		{
 		  struct string_list *p =  xmalloc (offsetof (struct string_list, string)
@@ -1526,8 +1553,9 @@ extract_link (char *file_name, int typeflag)
 		  strcpy (p->string, file_name);
 		  p->next = ds->sources;
 		  ds->sources = p;
-		  break;
 		}
+	    }
+
 	  return 0;
 	}
       else if ((e == EEXIST && strcmp (link_name, file_name) == 0)
@@ -1853,86 +1881,103 @@ extract_archive (void)
     undo_last_backup ();
 }
 
-/* Extract the links whose final extraction were delayed.  */
+/* Extract the link DS whose final extraction was delayed.  */
 static void
-apply_delayed_links (void)
+apply_delayed_link (struct delayed_link *ds)
 {
-  struct delayed_link *ds;
+  struct string_list *sources = ds->sources;
+  char const *valid_source = 0;
 
-  for (ds = delayed_link_head; ds; )
-    {
-      struct string_list *sources = ds->sources;
-      char const *valid_source = 0;
+  chdir_do (ds->change_dir);
 
-      chdir_do (ds->change_dir);
+  for (sources = ds->sources; sources; sources = sources->next)
+    {
+      char const *source = sources->string;
+      struct stat st;
 
-      for (sources = ds->sources; sources; sources = sources->next)
+      /* Make sure the placeholder file is still there.  If not,
+	 don't create a link, as the placeholder was probably
+	 removed by a later extraction.  */
+      if (fstatat (chdir_fd, source, &st, AT_SYMLINK_NOFOLLOW) == 0
+	  && st.st_dev == ds->dev
+	  && st.st_ino == ds->ino
+	  && BIRTHTIME_EQ (get_stat_birthtime (&st), ds->birthtime))
 	{
-	  char const *source = sources->string;
-	  struct stat st;
-
-	  /* Make sure the placeholder file is still there.  If not,
-	     don't create a link, as the placeholder was probably
-	     removed by a later extraction.  */
-	  if (fstatat (chdir_fd, source, &st, AT_SYMLINK_NOFOLLOW) == 0
-	      && st.st_dev == ds->dev
-	      && st.st_ino == ds->ino
-	      && BIRTHTIME_EQ (get_stat_birthtime (&st), ds->birthtime))
+	  /* Unlink the placeholder, then create a hard link if possible,
+	     a symbolic link otherwise.  */
+	  if (unlinkat (chdir_fd, source, 0) != 0)
+	    unlink_error (source);
+	  else if (valid_source
+		   && (linkat (chdir_fd, valid_source, chdir_fd, source, 0)
+		       == 0))
+	    ;
+	  else if (!ds->is_symlink)
 	    {
-	      /* Unlink the placeholder, then create a hard link if possible,
-		 a symbolic link otherwise.  */
-	      if (unlinkat (chdir_fd, source, 0) != 0)
-		unlink_error (source);
-	      else if (valid_source
-		       && (linkat (chdir_fd, valid_source, chdir_fd, source, 0)
-			   == 0))
-		;
-	      else if (!ds->is_symlink)
-		{
-		  if (linkat (chdir_fd, ds->target, chdir_fd, source, 0) != 0)
-		    link_error (ds->target, source);
-		}
-	      else if (symlinkat (ds->target, chdir_fd, source) != 0)
-		symlink_error (ds->target, source);
-	      else
-		{
-		  struct tar_stat_info st1;
-		  st1.stat.st_mode = ds->mode;
-		  st1.stat.st_uid = ds->uid;
-		  st1.stat.st_gid = ds->gid;
-		  st1.atime = ds->atime;
-		  st1.mtime = ds->mtime;
-                  st1.cntx_name = ds->cntx_name;
-                  st1.acls_a_ptr = ds->acls_a_ptr;
-                  st1.acls_a_len = ds->acls_a_len;
-                  st1.acls_d_ptr = ds->acls_d_ptr;
-                  st1.acls_d_len = ds->acls_d_len;
-                  st1.xattr_map = ds->xattr_map;
-		  set_stat (source, &st1, -1, 0, 0, SYMTYPE,
-			    false, AT_SYMLINK_NOFOLLOW);
-		  valid_source = source;
-		}
+	      if (linkat (chdir_fd, ds->target, chdir_fd, source, 0) != 0)
+		link_error (ds->target, source);
+	    }
+	  else if (symlinkat (ds->target, chdir_fd, source) != 0)
+	    symlink_error (ds->target, source);
+	  else
+	    {
+	      struct tar_stat_info st1;
+	      st1.stat.st_mode = ds->mode;
+	      st1.stat.st_uid = ds->uid;
+	      st1.stat.st_gid = ds->gid;
+	      st1.atime = ds->atime;
+	      st1.mtime = ds->mtime;
+	      st1.cntx_name = ds->cntx_name;
+	      st1.acls_a_ptr = ds->acls_a_ptr;
+	      st1.acls_a_len = ds->acls_a_len;
+	      st1.acls_d_ptr = ds->acls_d_ptr;
+	      st1.acls_d_len = ds->acls_d_len;
+	      st1.xattr_map = ds->xattr_map;
+	      set_stat (source, &st1, -1, 0, 0, SYMTYPE,
+			false, AT_SYMLINK_NOFOLLOW);
+	      valid_source = source;
 	    }
 	}
+    }
 
-      for (sources = ds->sources; sources; )
-	{
-	  struct string_list *next = sources->next;
-	  free (sources);
-	  sources = next;
-	}
+  for (sources = ds->sources; sources; )
+    {
+      struct string_list *next = sources->next;
+      free (sources);
+      sources = next;
+    }
+
+  xattr_map_free (&ds->xattr_map);
+  free (ds->cntx_name);
+}
 
-      xattr_map_free (&ds->xattr_map);
-      free (ds->cntx_name);
+/* Extract the links whose final extraction were delayed.  */
+static void
+apply_delayed_links (void)
+{
+  if (!delayed_link_table)
+    return;
 
+  for (struct delayed_link *dl = hash_get_first (delayed_link_table);
+       dl;
+       dl = dl->next ? dl->next : hash_get_next (delayed_link_table, dl))
+    if (!dl->has_predecessor)
       {
-	struct delayed_link *next = ds->next;
-	free (ds);
-	ds = next;
+	struct delayed_link *ds = dl;
+	do
+	  {
+	    apply_delayed_link (ds);
+	    ds = ds->next;
+	  }
+	while (ds);
       }
-    }
 
-  delayed_link_head = 0;
+  if (false)
+    {
+      /* There is little point to freeing, as we are about to exit,
+	 and freeing is more likely to cause than cure trouble.  */
+      hash_free (delayed_link_table);
+      delayed_link_table = NULL;
+    }
 }
 
 /* Finish the extraction of an archive.  */