Add LIFO queue option for recursive download #1

Closed · wants to merge 1 commit
12 changes: 12 additions & 0 deletions doc/wget.texi
@@ -1916,6 +1916,10 @@ case.
Turn on recursive retrieving. @xref{Recursive Download}, for more
details. The default maximum depth is 5.

@item --queue-type=@var{queuetype}
Specify the queue type (@pxref{Recursive Download}). Accepted values are @samp{fifo} (the default)
and @samp{lifo}.

@item -l @var{depth}
@itemx --level=@var{depth}
Specify recursion maximum depth level @var{depth} (@pxref{Recursive
@@ -2296,6 +2300,14 @@ documents linked by them, and so on. In other words, Wget first
downloads the documents at depth 1, then those at depth 2, and so on
until the specified maximum depth.

The @dfn{queue type} is either FIFO (the default) or LIFO.  With FIFO,
the URLs enqueued first are downloaded (dequeued) first.  With LIFO,
the most recently enqueued URLs are downloaded first.  LIFO can keep
links from expiring before they are downloaded, because each link is
fetched shortly after the parent page that produced it, i.e. right
after a dynamic page has generated its temporary links.  Pages
sometimes use such temporary links to prevent direct linking to files.
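
A rough standalone C sketch, not part of the patch, that mimics the
doubly-linked queue this change introduces in src/recur.c and prints the
order in which three URLs would be dequeued under each setting; the
struct and helper names here are invented for illustration only.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the patched queue: each element keeps
   both a prev and a next pointer. */
struct elem {
  const char *url;
  struct elem *prev, *next;
};

struct queue { struct elem *head, *tail; };

/* Append at the tail, as url_enqueue does. */
static void
enqueue (struct queue *q, const char *url)
{
  struct elem *e = calloc (1, sizeof *e);
  e->url = url;
  e->prev = q->tail;
  if (q->tail)
    q->tail->next = e;
  q->tail = e;
  if (!q->head)
    q->head = e;
}

/* FIFO takes from the head, LIFO takes from the tail. */
static const char *
dequeue (struct queue *q, int lifo)
{
  struct elem *e = lifo ? q->tail : q->head;
  const char *url;

  if (!e)
    return NULL;
  if (lifo)
    {
      q->tail = e->prev;
      if (!q->tail)
        q->head = NULL;
    }
  else
    {
      q->head = e->next;
      if (!q->head)
        q->tail = NULL;
    }
  url = e->url;
  free (e);
  return url;
}

int
main (void)
{
  const char *urls[] = { "index.html", "page2.html", "file.zip" };
  int lifo;

  for (lifo = 0; lifo <= 1; lifo++)
    {
      struct queue q = { NULL, NULL };
      const char *u;
      int i;

      for (i = 0; i < 3; i++)
        enqueue (&q, urls[i]);
      printf ("%s:", lifo ? "lifo" : "fifo");
      while ((u = dequeue (&q, lifo)) != NULL)
        printf (" %s", u);
      printf ("\n");
    }
  /* prints:
       fifo: index.html page2.html file.zip
       lifo: file.zip page2.html index.html */
  return 0;
}

Compiled and run, the FIFO pass prints the URLs in insertion order while
the LIFO pass prints them in reverse, which is what lets a just-generated
temporary link be fetched immediately after its parent page.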

The maximum @dfn{depth} to which the retrieval may descend is specified
with the @samp{-l} option. The default maximum depth is five layers.

21 changes: 21 additions & 0 deletions src/init.c
@@ -104,6 +104,7 @@ CMD_DECLARE (cmd_spec_htmlify);
CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_prefer_family);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_queue_type);
CMD_DECLARE (cmd_spec_recursive);
CMD_DECLARE (cmd_spec_regex_type);
CMD_DECLARE (cmd_spec_restrict_file_names);
@@ -247,6 +248,7 @@ static const struct {
{ "proxypasswd", &opt.proxy_passwd, cmd_string }, /* deprecated */
{ "proxypassword", &opt.proxy_passwd, cmd_string },
{ "proxyuser", &opt.proxy_user, cmd_string },
{ "queuetype", &opt.queue_type, cmd_spec_queue_type },
{ "quiet", &opt.quiet, cmd_boolean },
{ "quota", &opt.quota, cmd_bytes_sum },
#ifdef HAVE_SSL
@@ -403,6 +405,8 @@ defaults (void)
opt.restrict_files_nonascii = false;
opt.restrict_files_case = restrict_no_case_restriction;

opt.queue_type = queue_type_fifo;

opt.regex_type = regex_type_posix;

opt.max_redirect = 20;
@@ -1441,6 +1445,23 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored _GL_UN
return true;
}

/* Validate --queue-type and set the choice. */

static bool
cmd_spec_queue_type (const char *com, const char *val, void *place_ignored _GL_UNUSED)
{
static const struct decode_item choices[] = {
{ "fifo", queue_type_fifo },
{ "lifo", queue_type_lifo },
};
int queue_type = queue_type_fifo;
int ok = decode_string (val, choices, countof (choices), &queue_type);
if (!ok)
fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
opt.queue_type = queue_type;
return ok;
}
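
For context, decode_string and struct decode_item are existing wget
helpers that map a string onto one of the listed codes; their real
implementation is not shown in this diff, so the sketch below is only an
assumed, simplified stand-in with invented names, matching the call shape
used above (value, table, count, out-parameter, truthy return on success).
The case-insensitive matching is also an assumption.

#include <stdio.h>
#include <strings.h>   /* strcasecmp (POSIX) */

/* Invented, simplified stand-in for wget's decode_item/decode_string. */
struct demo_decode_item {
  const char *name;
  int code;
};

static int
demo_decode_string (const char *val, const struct demo_decode_item *items,
                    int count, int *place)
{
  int i;

  for (i = 0; i < count; i++)
    if (val && strcasecmp (val, items[i].name) == 0)
      {
        *place = items[i].code;
        return 1;               /* found */
      }
  return 0;                     /* not found; *place is left alone */
}

int
main (void)
{
  static const struct demo_decode_item choices[] = {
    { "fifo", 0 },   /* stands in for queue_type_fifo */
    { "lifo", 1 },   /* stands in for queue_type_lifo */
  };
  int type = 0;

  if (demo_decode_string ("LIFO", choices, 2, &type))
    printf ("queue type code: %d\n", type);   /* prints 1 */
  return 0;
}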

/* Validate --regex-type and set the choice. */

static bool
3 changes: 3 additions & 0 deletions src/main.c
@@ -272,6 +272,7 @@ static struct cmdline_option option_data[] =
{ "proxy-passwd", 0, OPT_VALUE, "proxypassword", -1 }, /* deprecated */
{ "proxy-password", 0, OPT_VALUE, "proxypassword", -1 },
{ "proxy-user", 0, OPT_VALUE, "proxyuser", -1 },
{ "queue-type", 0, OPT_VALUE, "queuetype", -1 },
{ "quiet", 'q', OPT_BOOLEAN, "quiet", -1 },
{ "quota", 'Q', OPT_VALUE, "quota", -1 },
{ "random-file", 0, OPT_VALUE, "randomfile", -1 },
@@ -736,6 +737,8 @@ WARC options:\n"),
Recursive download:\n"),
N_("\
-r, --recursive specify recursive download\n"),
N_("\
--queue-type=TYPE queue type (fifo|lifo)\n"),
N_("\
-l, --level=NUMBER maximum recursion depth (inf or 0 for infinite)\n"),
N_("\
4 changes: 4 additions & 0 deletions src/options.h
@@ -46,6 +46,10 @@ struct options
bool relative_only; /* Follow only relative links. */
bool no_parent; /* Restrict access to the parent
directory. */
enum {
queue_type_fifo,
queue_type_lifo
} queue_type; /* Recursion queue type */
int reclevel; /* Maximum level of recursion */
bool dirstruct; /* Do we build the directory structure
as we go along? */
104 changes: 97 additions & 7 deletions src/recur.c
@@ -51,6 +51,64 @@ as that of the covered work. */
#include "css-url.h"
#include "spider.h"

/* Linked-list bubble sort adapted from http://stackoverflow.com/questions/19522121.
Sorts in descending order of link_expect_html, so that HTML pages come first. */

static void
ll_bubblesort (struct urlpos **pp)
{
/* p points at the head of the not-yet-sorted remainder of the list */
struct urlpos *p = *pp;
*pp = 0;

while (p)
{
struct urlpos **lhs = &p;
struct urlpos **rhs = &p->next;
bool swapped = false;

/* keep going until rhs holds the address of a null pointer */
while (*rhs)
{
/* if the right side is greater than the left side */
if ((*rhs)->link_expect_html > (*lhs)->link_expect_html)
{
/* swap the node pointers, then swap back their next pointers */
struct urlpos *tmp = *lhs;
*lhs = *rhs;
*rhs = tmp;
tmp = (*lhs)->next;
(*lhs)->next = (*rhs)->next;
(*rhs)->next = tmp;
lhs = &(*lhs)->next;
swapped = true;
}
else
{ /* no swap; advance both pointer-pointers */
lhs = rhs;
rhs = &(*rhs)->next;
}
}

/* link the last node to the sorted segment */
*rhs = *pp;

/* if we swapped, detach the final node, terminate the list, and continue */
if (swapped)
{
/* take the last node off the list and push it into the result */
*pp = *lhs;
*lhs = 0;
}

/* otherwise we're done: since no swaps happened the list is sorted.
set the output parameter and terminate the loop. */
else
{
*pp = p;
break;
}
}
}
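
To make the resulting order concrete, here is a small standalone sketch
that restates the same prepend-the-minimum bubble pass on a simplified
node struct whose key plays the role of link_expect_html; all names are
invented for the example. Running it shows the list ends up in descending
key order, i.e. HTML entries (key 1) move to the front.

#include <stdio.h>

/* Simplified node: `key' plays the role of urlpos.link_expect_html. */
struct node { int key; struct node *next; };

/* Same idea as ll_bubblesort above: each pass bubbles the smallest
   remaining node to the end of the unsorted part and prepends it to
   the sorted result, so the final list is in descending key order. */
static void
demo_sort (struct node **pp)
{
  struct node *p = *pp;
  *pp = NULL;
  while (p)
    {
      struct node **lhs = &p, **rhs = &p->next;
      int swapped = 0;
      while (*rhs)
        {
          if ((*rhs)->key > (*lhs)->key)
            {
              struct node *tmp = *lhs;
              *lhs = *rhs; *rhs = tmp;
              tmp = (*lhs)->next;
              (*lhs)->next = (*rhs)->next; (*rhs)->next = tmp;
              lhs = &(*lhs)->next;
              swapped = 1;
            }
          else
            { lhs = rhs; rhs = &(*rhs)->next; }
        }
      *rhs = *pp;
      if (swapped)
        { *pp = *lhs; *lhs = NULL; }
      else
        { *pp = p; break; }
    }
}

int
main (void)
{
  /* 0 = non-HTML link, 1 = HTML page; initial order: 0, 1, 0, 1 */
  struct node d = { 1, NULL }, c = { 0, &d }, b = { 1, &c }, a = { 0, &b };
  struct node *list = &a, *n;

  demo_sort (&list);
  for (n = list; n; n = n->next)
    printf ("%d ", n->key);     /* prints: 1 1 0 0 */
  printf ("\n");
  return 0;
}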

/* Functions for maintaining the URL queue. */

struct queue_element {
@@ -62,6 +120,7 @@ struct queue_element {
struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *prev; /* previous element in queue */
struct queue_element *next; /* next element in queue */
};

@@ -88,9 +147,9 @@ url_queue_delete (struct url_queue *queue)
xfree (queue);
}

/* Enqueue a URL in the queue. The queue is FIFO: the items will be
retrieved ("dequeued") from the queue in the order they were placed
into it. */
/* Enqueue a URL in the queue. With a FIFO queue the items are
retrieved ("dequeued") in the order they were placed into it; with a
LIFO queue the most recently enqueued item is retrieved first. */

static void
url_enqueue (struct url_queue *queue, struct iri *i,
@@ -104,6 +163,7 @@
qel->depth = depth;
qel->html_allowed = html_allowed;
qel->css_allowed = css_allowed;
qel->prev = NULL;
qel->next = NULL;

++queue->count;
@@ -119,7 +179,11 @@
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));

if (queue->tail)
{
if (opt.queue_type == queue_type_lifo)
qel->prev = queue->tail;
queue->tail->next = qel;
}
queue->tail = qel;

if (!queue->head)
@@ -134,14 +198,36 @@ url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
struct queue_element *qel;

switch (opt.queue_type)
{
default:
case queue_type_fifo:
qel = queue->head;
break;
case queue_type_lifo:
qel = queue->tail;
break;
}

if (!qel)
return false;

queue->head = queue->head->next;
if (!queue->head)
queue->tail = NULL;
switch (opt.queue_type)
{
default:
case queue_type_fifo:
queue->head = queue->head->next;
if (!queue->head)
queue->tail = NULL;
break;
case queue_type_lifo:
queue->tail = queue->tail->prev;
if (!queue->tail)
queue->head = NULL;
break;
}

*i = qel->iri;
*url = qel->url;
@@ -407,6 +493,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
if (strip_auth)
referer_url = url_string (url_parsed, URL_AUTH_HIDE);

/* With a LIFO queue, enqueue HTML pages first so that non-HTML
links (e.g. temporary file links) are dequeued before them. */
if (opt.queue_type == queue_type_lifo)
ll_bubblesort (&child);

for (; child; child = child->next)
{
if (child->ignore_when_downloading)