#!/bin/bash
# /usr/local/bin/download
# https://crystalfaeries.net/posix/bin/download
# celeste:crystalfaery DOWNLOAD 2017-08-12 22:26:14+00:00
# See Also: /usr/local/bin/downloads for management
# This is a generic website mirroring agent, HOWEVER,
# NOTE: in the wget invocations the REJECT of URL pattern '*showComment=*'
# is intended to NOT download COMMENTS added to BLOG postings,
# as usually the PUBLIC are blithering idiots posting banal drivel,
# if not paid government trolls.
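# (Illustrative only: a Blogger-style comment permalink such as
#  http://blog.example.com/2017/01/post.html?showComment=1484000000000
#  matches '*showComment=*' and is therefore skipped.)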
cd -P /home/downloads/ 2> /dev/null || cd -P /home/downloads-/ 2> /dev/null || cd -P "$HOME"/downloads/ 2> /dev/null || cd -P "$HOME"/Downloads/ 2> /dev/null || exit 1 # edit this to match your paths
while [[ $# -ne 0 ]] # We accept a list of URLs
do
url="${1}" # work on the first argument
shift # consume the first argument
myurl="$(echo ${url} | sed 's/^https// ; s/^http// ; s/^ftps// ; s/^ftp// ; s/^:\/\/// ; s/^www\.//')"
domain="$(echo ${url} | sed 's/^https// ; s/^http// ; s/^ftps// ; s/^ftp// ; s/^:\/\/// ; s/^www\.// ; s/\/.*$//')"
if [ "X$domain" == "X" ]
then
echo "${url} is malformed or edit the seds in $0" 1>&2
else
mkdir -p "${domain}" # in downloads dir, single dir without www. prefix
if [ \! -e www."${domain}" ]
then
ln -s "${domain}" www."${domain}" # symlinked to, from the www.prefixed name
fi
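# Resulting layout under the downloads directory (hypothetical domain "example.com"):
#   example.com/                      mirror tree populated by wget below
#   www.example.com -> example.com    compatibility symlink
#   example.com/.wget.txt             per-domain log appended to below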
echo "======= Begin downloading ${url} for `whoami` at `/usr/local/bin/now` =======" >> "${domain}"/.wget.txt
# We try both with and without the www. prefix to guarantee a successful download,
# regardless of which way the website "canonicalizes" its own URL.
# If the site serves the same content either way,
# we end up attempting each file twice, at twice the "-w" rate.
# DISABLED NEXT LINE TESTING IF IT'S REALLY NEEDED:
# wget -c -E --follow-ftp -k -m --no-check-certificate -np -nv -p --random-wait -w 30 --unlink --retry-connrefused -R '*showComment=*' www."${myurl}" >> "${domain}"/.wget.txt 2>&1 < /dev/null &
wget -c -E --follow-ftp -k -m --no-check-certificate -np -nv -p --random-wait -w 30 --unlink --retry-connrefused -R '*showComment=*' "${myurl}" >> "${domain}"/.wget.txt 2>&1 < /dev/null &
disown %% # detach the background wget so it keeps running after this shell exits
# disown %1 %2 # would be needed if the disabled www.-prefixed wget above were re-enabled
fi
done
exit $?
# wget invoke Wget ("Web Get"), the non-interactive network downloader
# -c (--continue) Continue getting a partially-downloaded file
# -E (--adjust-extension) Append the .html suffix to downloaded HTML/XHTML files whose URLs lack one
# --follow-ftp Follow FTP links from HTML documents.
# -H (--span-hosts) Enable spanning across hosts when doing recursive retrieving. (Not used in the invocation above.)
# -k (--convert-links) After the download is complete, convert the links in the document to make them suitable for local viewing.
# -l 1 (--level=1) Specify the maximum recursion depth. (Not used above; -m sets the depth to infinite.)
# -m (--mirror) Turn on options suitable for mirroring.
# This option turns on recursion and time-stamping, sets infinite recursion depth and
# keeps FTP directory listings. It is currently equivalent to -r -N -l inf --no-remove-listing
# --no-check-certificate Don't check the server certificate against the available certificate authorities.
# Also don't require the URL host name to match the common name presented by the certificate.
# -np (--no-parent) Do not ever ascend to the parent directory when retrieving recursively.
# This guarantees that only the files below a certain hierarchy will be downloaded.
# -nv (--no-verbose) Turn off verbose without being completely quiet (use -q for that),
# which means that error messages and basic information still get printed.
# -p (--page-requisites) This option causes Wget to download all the files that are necessary to properly display a given HTML page.
# This includes such things as inlined images, sounds, and referenced stylesheets.
# --random-wait Some web sites may perform log analysis to identify retrieval programs such as Wget
# by looking for statistically significant similarities in the time between requests.
# This option causes the time between requests to vary between 0.5 and 1.5 * wait seconds,
# where wait was specified using the --wait option, in order to mask Wget's presence from such analysis.
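#	With the "-w 30" used in the invocation above, the actual delay between requests
#	therefore varies between roughly 15 and 45 seconds.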
# -w 30 (--wait=30) Wait 30 seconds between retrievals
# --unlink Force Wget to unlink file instead of clobbering existing file.
# This option is useful for downloading to the directory with hardlinks.
# --retry-connrefused Consider "connection refused" a transient error and try again.
# Normally Wget gives up on a URL when it is unable to connect to the site
# because failure to connect is taken as a sign that the server is not running at all
# and that retries would not help.
# This option is for mirroring unreliable sites whose servers tend to disappear for short periods of time.
# -R rejlist (--reject rejlist) Do not download files whose names match the rejection list;
# here '*showComment=*' rejects blog-comment permalinks. "${myurl}" is the URL being mirrored.
# >> redirect command output
# "${domain}"/.wget.txt to a log file in /home/downloads/URL/.wget.txt
# 2>&1 send error output to same logfile
# & Do it all in the background
#