# Build the wget command line used to spider the CMS site.
#
# The resulting string is meant to be handed to a POSIX shell (it ends with
# output redirection).  The two interpolated values, $host_name and
# $config->{cms_url}, are single-quoted so that shell metacharacters in them
# (for example "&" or "?" in a URL) reach wget literally instead of being
# interpreted by the shell.
my $spider_cmd = do {

    # Wrap a value in single quotes, escaping any embedded single quote.
    my $shell_quote = sub {
        my ($value) = @_;
        $value =~ s/'/'\\''/g;
        return "'$value'";
    };

    join q{ },
        'wget',
        '--recursive',                              # follow links (within site)
        '--domains ' . $shell_quote->($host_name),  # don't follow off-site links
        '--level=inf',                              # go deep!
        '--timestamping',                           # probably not useful
        '--html-extension',                         # give resulting HTML files .html ext
        '--convert-links',                          # rewrite links to match new extensions
        '--backup-converted',                       # avoid re-downloading renamed files
        $shell_quote->( $config->{cms_url} ),       # URL to start spidering from CMS site
        '>spider.log 2>&1';                         # Log STDOUT/STDERR in work dir
};
- Spidering start page: /sitemap-all
- Spider accesses Drupal as an anonymous user