#!/bin/bash
# /home/local/bin/webget
# https://crystalfaeries.net/posix/bin/webget
# celeste:crystalfaery WEBGET 2016-09-02 15:48:00+00:00
# webget is intended to run from a user crontab to maintain a web archive of multiple websites.
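# for example, an hourly user crontab entry might look like this (the log path is
# illustrative only, not something this script requires):
#     0 * * * *	/home/local/bin/webget >> "$HOME/webget.log" 2>&1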
# it is presumed there may be non-URL-structured directories in the archive which cannot be updated via a new wget.
# is it? but we are not launching new wgets, merely diddling wget logs... maybe this script was never finished!
# given our newer script "downloads" for tracking downloads, is there anything useful in this script
# to either cannibalize or to preserve?
# the philosophy here is to preserve records of the start and finish of a wget process in the archive,
# whereas the existing downloads script is intentionally blowing away all records...
# therefore merging code from this into downloads could improve it to maintain some records of use.
# this script assumes a simpler .wget.txt file in some ways, but does perhaps usefully generate:
# .wget.started
# .wget.finished
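# for example, after a few runs .wget.started might accumulate lines like
#     STARTED --1970-01-01 23:59:59--
#     STARTED --2016-09-02 15:48:00--
# and .wget.finished the corresponding "FINISHED --...--" lines, so comparing the
# newest timestamp in each shows whether the last recorded run ever completed.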
# if we are to implement the intended function of auto-periodically spawning wget to update,
# the question has always been how to properly determine which URL to re-"download".
# the new "download" script was recording that in .wget.txt, which "downloads" has blown away,
# so the existing archive is devoid of useful records to determine that; thus
# we must restart all existing archives manually, verifying which URL to download.
# however, once a new integrated script is in place permanently recording the URL(s),
# webget could maintain them, therefore some cross-pollination seems in order:
# webget is obviously the cron-spawned daemon, and "downloads" the manual monitor of it all.
#
# My first take on all this is to update "download" to generate the two extra files at start,
# then update "downloads" to update them when it interacts,
# and then update webget as the daemon to auto-update based on those files.
# NOTE: just because we ever downloaded something does NOT mean we wish to re-auto-update it!
# THUS: "downloads" usage initiates MARKING a URL (or several) for redownload,
# which webget then acts on periodically.
# DO we need to differentiate between "keep updated forever" and "retry a failed download until it succeeds"?
# any URL marked for auto-re-update is going to eventually get successfully downloaded,
# but URLs marked for download-once only retry until ONE successful completion... AHA!
# downloads now offers a "redownload" command, which should be updated accordingly to mean ONCE;
# downloads should also offer a "maintain" command to signify periodic updates (see the sketch below).
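# a minimal sketch of that marking scheme (the marker file names ".wget.redownload"
# and ".wget.maintain" are hypothetical, not yet used by either script):
#     touch "$directory/.wget.redownload"	# "downloads redownload": retry until ONE success, then remove the marker
#     touch "$directory/.wget.maintain"	# "downloads maintain":   keep re-updating on every webget run
# webget would then skip any directory carrying neither marker.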
#
# NOTE: the .wget.txt log is a mess of interleaved output from simultaneous wget processes.
# Its main utility is comparing running wget processes against the recorded
# START and FINISHED messages to ascertain "IS a wget ACTIVE?", and it could/should
# be processed further to determine WHICH URL actually works, between
# www.URL and the bare URL, versus which one the user originally requested and which the server canonicalizes.
# THE 1st STEP is to begin saving START and FINISHED logs and URLs; a sketch of recovering the URL follows below.
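# a sketch of pulling the originally requested URL back out of such a log (this assumes
# the first line of .wget.txt records the invoking wget command, as the "firstline"
# extraction below also presumes; the sed expression is illustrative only):
#     requested_url=$(head -n 1 "$directory/.wget.txt" | sed 's/.*\(https\?:\/\/[^ ]*\).*/\1/')
# comparing $requested_url against the hostnames wget actually logs would show whether
# the server canonicalizes to the www. form or the bare form.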
exit 1 # not ready for prime-time
# where do we keep the downloaded websites
cd /home/downloads || cd ~/downloads || exit 1
# process the oldest directory first (-mindepth 1 excludes "." itself; note this
# word-splitting loop breaks on directory names containing whitespace)
for directory in $(ls -trd $(find . -mindepth 1 -maxdepth 1 -type d)) ; do
	if [ -f "$directory/.wget.txt" ]
	then	# we may be running a wget already
		:
	else	# create an empty genesis log at minimum
		echo ""						>> "$directory/.wget.txt"
		echo "STARTED --1970-01-01 23:59:59--"		>> "$directory/.wget.txt"
		echo ""						>> "$directory/.wget.txt"
		touch -t 197001012359.59 "$directory/.wget.txt"
	fi
	# check for a wget log I would generate
	if [ -f "$directory/.wget.txt" ]
	then	# we have a log
		firstline=$(head -n 1 "$directory/.wget.txt")	# we track the invoking command (not yet used below)
		echo "STARTED --1970-01-01 23:59:59--"		>  "$directory/.wget.started"
		grep '^STARTED --'  "$directory/.wget.txt"	>> "$directory/.wget.started"
		echo "FINISHED --1970-01-01 23:59:59--"		>  "$directory/.wget.finished"
		grep '^FINISHED --' "$directory/.wget.txt"	>> "$directory/.wget.finished"
		start=$( tail -n 1 "$directory/.wget.started"  | sed 's/STARTED --//' )
		finish=$(tail -n 1 "$directory/.wget.finished" | sed 's/FINISHED --//')
		# update status
		echo ""						>> "$directory/.wget.txt"
		echo "STARTED --$(/usr/local/bin/now)--"	>> "$directory/.wget.txt"	# wget uses local time logging
		echo ""						>> "$directory/.wget.txt"
		if [[ "$finish" < "$start" ]]
		then
			echo -n "$directory idle since $finish"	>> "$directory/.wget.txt"
		else
			echo -n "$directory busy since $start"	>> "$directory/.wget.txt"
		fi
		# check for an executing wget in this URL tree
		# (match the directory name literally, minus the leading "./", and exclude this pipeline's own grep from the ps listing)
		ps -ef --forest | grep -F -- "${directory#./}" | grep -v 'grep'	>> "$directory/.wget.txt" \
		&& echo "I think we are busy."	>> "$directory/.wget.txt" \
		|| echo "I think we are idle."	>> "$directory/.wget.txt"
	fi
done
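# sketch of the intended (still unimplemented) auto-update step, to run inside the loop
# above for directories marked for maintenance; the ".wget.maintain" marker and the use
# of the log's first line as the invoking command are assumptions, and the wget flags
# shown are one plausible mirroring invocation, not a recorded one:
#	if [ -f "$directory/.wget.maintain" ] && ! pgrep -f "wget.*${directory#./}" > /dev/null
#	then
#		url=$(head -n 1 "$directory/.wget.txt" | grep -o 'https\?://[^ ]*')
#		[ -n "$url" ] && ( cd "$directory" && \
#			wget --mirror --no-parent --convert-links --wait=1 "$url" >> .wget.txt 2>&1 & )
#	fi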