#!/bin/bash
#                      /home/local/bin/webget
# https://crystalfaeries.net/posix/bin/webget
# celeste:crystalfaery WEBGET 2016-09-02 15:48:00+00:00
# webget is intended to run from a user crontab to maintain a web archive of multiple websites.
# it is presumed there may be non-url structured directories in the archive which cannot be updated via a new wget
# it is? but we are not launching new wgets, merely diddling wget logs... maybe this script was never finished!
# given our newer script "downloads" for tracking downloads, is there anything useful in this script?
# to either cannibalize or to preserve?
# the philosophy here is to preserve records of start and finish of a wget process in the archive,
# whereas the existing downloads script is blowing-away all records intentionally...
# therefore merging code from this into  downloads could improve it to maintain some records of use.
# this script assumes simpler .wget.txt file in some ways, but does perhaps usefully generate:
# .wget.started
# .wget.finished
# if we are to implement the intended function of auto-periodic spawning of wget to update,
# the question has always been to properly determine which URL to re-"download".
# the new "download" script is recording that in .wget.txt which "downloads" has blown-away,
# so the existing archive is devoid of useful records to determine that, thus
# we must restart all existing archives manually verifying which URL to download,
# however once new integrated script is involved permanently recording the URL(s),
# webget could maintain them, therefore some cross-pollination seems in order,
# webget is obviously the cron-spawned daemon, and "downloads" the manual monitor of it all.
#
# My first take on this all is to update "download" to generate the two extra files at start
# and then update "downloads" to update them when it interacts,
# and then update webget as the daemon to auto-update based on those files.
# NOTE: just because we ever downloaded something does NOT mean we wish to re-auto-update it!
# THUS: downloads usage initiates MARKING a URL to redownload (or multiples)
#       which webget then acts on periodically.
# DO? we need to differentiate between forever keep updated, and download retries vs failure?
# any URL marked for auto-re-update is going to eventually get successfully downloaded,
# but URLs marked for download once only retry until ONE successful completion... AHA!
# downloads now offers a "redownload" command, which should be updated accordingly to mean ONCE
# downloads should also offer a "maintain" command to signify periodic updates.
#
# NOTE: the .wget.txt log is a mess of interleaving of simultaneous wget processes
# the main utility of which is comparing running wget processes versus recorded
# START and FINISHED messages to ascertain "IS a wget ACTIVE?", and could/should
# be processed further to determine WHICH URL actually does work between
# www.URL and just URL, versus which the user originally requested, which server canonicalizes?

# THE 1st STEP is to begin saving START and FINISHED logs and URLs.

exit 1	# not ready for prime-time (exit codes are 0-255; 'exit -1' is out of range and unportable)

# where do we keep the downloaded websites
cd /home/downloads || cd ~/downloads || exit 1

# process the oldest directory first.
# NOTE: never parse 'ls' output (breaks on whitespace in names); order by
# mtime with GNU find -printf + sort, and read paths line-by-line instead.
while IFS= read -r directory ; do

	if [ -f "$directory/.wget.txt" ]
	then	# we may be running a wget already
		:
	else	# create an empty genesis log at minimum, stamped at the epoch-era sentinel date
		echo ""						>> "$directory/.wget.txt"
		echo "STARTED --1970-01-01 23:59:59--"		>> "$directory/.wget.txt"
		echo ""						>> "$directory/.wget.txt"
		touch	-t	197001012359.59			   "$directory/.wget.txt"
	fi

	# the log is guaranteed to exist now (created above if it was missing),
	# so process it unconditionally instead of re-testing -f.
	# shellcheck disable=SC2034 — firstline is recorded for future use (invoking command)
	firstline=$(head -n 1 "$directory/.wget.txt")
	# rebuild the START/FINISH indexes, each seeded with a sentinel line so
	# tail -n 1 always has something to return even for an empty history
	echo  "STARTED --1970-01-01 23:59:59--"		>  "$directory/.wget.started"
	grep '^STARTED --'	"$directory/.wget.txt"	>> "$directory/.wget.started"
	echo  "FINISHED --1970-01-01 23:59:59--"	>  "$directory/.wget.finished"
	grep '^FINISHED --'	"$directory/.wget.txt"	>> "$directory/.wget.finished"
	 start=$(tail -n 1 "$directory/.wget.started"  | sed 's/STARTED --//' )
	finish=$(tail -n 1 "$directory/.wget.finished" | sed 's/FINISHED --//')
	# update status
	echo ""						>> "$directory/.wget.txt"
	echo "STARTED --$(/usr/local/bin/now)--"	>> "$directory/.wget.txt"	# wget uses local time logging
	echo ""						>> "$directory/.wget.txt"

	# lexicographic '<' is a valid ordering for these ISO-style timestamps.
	# NOTE(review): "finish older than start" usually means a wget is still
	# running, yet this branch reports "idle" — looks inverted, but the
	# original intent is unclear (script unfinished); behavior preserved.
	if [[ "$finish" < "$start" ]]
	then
		echo -n "$directory idle since $finish"	>> "$directory/.wget.txt"
	else
		echo -n "$directory busy since $start"	>> "$directory/.wget.txt"
	fi

	# check for an executing wget in this URL tree.
	# grep -F -- treats the path as a literal string (no regex surprises),
	# and 'grep -v grep' keeps the grep process itself out of the match so
	# we don't report a false "busy"; a real if/else replaces the fragile
	# 'cmd && a || b' chain (b would also run if a failed).
	if ps -ef --forest | grep -F -- "$directory" | grep -v grep >> "$directory/.wget.txt"
	then
		echo "I think we are busy."		>> "$directory/.wget.txt"
	else
		echo "I think we are idle."		>> "$directory/.wget.txt"
	fi

done < <(find . -mindepth 1 -maxdepth 1 -type d -printf '%T@\t%p\n' | sort -n | cut -f2-)


# syntax highlighted by Code2HTML, v. 0.9.1