#!/bin/bash
#                       /usr/local/bin/download
# https://crystalfaeries.net/posix/bin/download
# celeste:crystalfaery DOWNLOAD 2017-08-12 22:26:14+00:00
# See Also: /usr/local/bin/downloads for management
# This is a generic website mirroring agent, HOWEVER,
# NOTE: in the wget invocations the REJECT of URL pattern '*showComment=*'
# is intended to NOT download COMMENTS added to BLOG postings,
# as usually the PUBLIC are blithering idiots posting banal drivel,
# if not paid government trolls.
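#
# Example usage (hypothetical URLs, assuming this script is installed on PATH as 'download'):
#	download https://www.example.com/blog/ ftp://mirror.example.org/pub/
# A hypothetical comment page such as
#	https://www.example.com/blog/post.html?showComment=1501234567890
# is intended to match '*showComment=*' and be skipped.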

cd -P /home/downloads/ 2> /dev/null || cd -P /home/downloads-/ 2> /dev/null || cd -P "$HOME"/downloads/ 2> /dev/null || cd -P "$HOME"/Downloads/ 2> /dev/null || exit 1 # edit this to match your paths; the first directory that exists wins

while [[ $# -ne 0 ]]		# We accept a list of URLs
do
	url="${1}"		# work on	the first argument
	shift			# consume	the first argument
	 myurl="$(echo ${url} | sed 's/^https// ; s/^http// ; s/^ftps// ; s/^ftp// ; s/^:\/\/// ; s/^www\.//')"
	domain="$(echo ${url} | sed 's/^https// ; s/^http// ; s/^ftps// ; s/^ftp// ; s/^:\/\/// ; s/^www\.// ; s/\/.*$//')"
	if [ "X$domain" == "X" ]
		then
			echo "${url} is malformed or edit the seds in $0" 1>&2
		else
			mkdir	-p	"${domain}"			# in downloads dir, single dir without www. prefix
			if [ \! -e			www."${domain}" ]
			then
				ln -s	"${domain}"	www."${domain}" # symlinked to, from the www.prefixed name
			fi
			echo  "======= Begin downloading ${url} for `whoami` at `/usr/local/bin/now` ======="	>> "${domain}"/.wget.txt
			# We try both with and without the www. prefix so that at least one attempt succeeds
			# regardless of which way the website canonicalizes its own URL.
			# If both names alias to the same content,
			# each file will be attempted twice, at twice the "-w" rate (see the example below).
			# DISABLED: the next line is commented out while testing whether it is really needed:
# wget -c -E --follow-ftp -k -m --no-check-certificate -np -nv -p --random-wait -w 30 --unlink --retry-connrefused -R '*showComment=*' www."${myurl}" >> "${domain}"/.wget.txt 2>&1 < /dev/null &
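# With both lines enabled, a hypothetical https://www.example.com/blog/ would be fetched twice:
# once as www.example.com/blog/ and once as example.com/blog/.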
wget -c -E --follow-ftp -k -m --no-check-certificate -np -nv -p --random-wait -w 30 --unlink --retry-connrefused -R '*showComment=*'     "${myurl}" >> "${domain}"/.wget.txt 2>&1 < /dev/null &
			disown %1
#			disown %1 %2
	fi
done
exit	$?
# wget				invoke Web Get
# -c (--continue)			Continue getting a partial-download
# -E (--adjust-extension)		append a .html extension to downloaded HTML files whose URLs lack one
# --follow-ftp			Follow FTP links from HTML documents.
# -H (--span-hosts)		Enable spanning across hosts when doing recursive retrieving (not used in the invocation above).
# -k (--convert-links)		After the download is complete, convert the links in the document to make them suitable for local viewing.
# -l 1 (--level=1)		Specify recursion maximum depth level (not used here; -m implies infinite depth).
# -m (--mirror)			Turn on options suitable for mirroring.
# 				This option turns on recursion and time-stamping, sets infinite recursion depth and
# 				keeps FTP directory listings.  It is currently equivalent to -r -N -l inf --no-remove-listing
# --no-check-certificate		Don't check the server certificate against the available certificate authorities.
# 				Also don't require the URL host name to match the common name presented by the certificate.
# -np (--no-parent)		Do not ever ascend to the parent directory when retrieving recursively. 
# 				This guarantees that only the files below a certain hierarchy will be downloaded.
# -nv (--no-verbose)		Turn off verbose without being completely quiet (use -q for that),
# 				which means that error messages and basic information still get printed.
# -p (--page-requisites)	This option causes Wget to download all the files that are necessary to properly display a given HTML page.
# 				This includes such things as inlined images, sounds, and referenced stylesheets.
# --random-wait			Some web sites may perform log analysis to identify retrieval programs such as Wget
# 				by looking for statistically significant similarities in the time between requests.
# 				This option causes the time between requests to vary between 0.5 and 1.5 * wait seconds,
# 				where wait was specified using the --wait option, in order to mask Wget's presence from such analysis.
# -w 30 (--wait=30)		Wait 30 seconds between retrievals; combined with --random-wait the actual delay varies between 15 and 45 seconds.
# --unlink			Force Wget to unlink file instead of clobbering existing file.
# 				This option is useful for downloading to the directory with hardlinks.
# --retry-connrefused		Consider "connection refused" a transient error and try again.
# 				Normally Wget gives up on a URL when it is unable to connect to the site
# 				because failure to connect is taken as a sign that the server is not running at all
# 				and that retries would not help.
# 				This option is for mirroring unreliable sites whose servers tend to disappear for short periods of time.
# -R rejlist (--reject rejlist)	Skip URLs matching the pattern '*showComment=*' (blog comment pages); "${myurl}" is the start URL.
# >>				redirect command output
# "${domain}"/.wget.txt		to a log file in /home/downloads/URL/.wget.txt
# 2>&1				send error output to same logfile
# &				Do it all in the background
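#
# To follow progress for a given site (hypothetical domain shown), tail its log, e.g.:
#	tail -f /home/downloads/example.com/.wget.txt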
# 

