#!/bin/bash
# /usr/local/bin/de-www
# http://crystalfaeries.net/posix/bin/de-www
# celeste:crystalfaery 2015-12-23 17:14:11+00:00
# de-www cleans our downloads archive created with our "download" script
# moving all content to directories which do NOT have the "www." prefix,
# and replacing the directory paths which DO have the "www." prefix with symlinks
# WHY?
# some websites have FULL equivalence between {www.,}domain.tld
# some websites INSIST upon forcing accesses to the FQDN www.domain.tld
# some websites ALLOW full access via domain.tld
# we wish to not have duplicates of content in different directories
# we MUST try downloading BOTH {www.,}domain.tld to ENSURE successful download
# we have not YET re-written "download" to intelligently probe a domain
# to resolve this question BEFORE it proceeds to wget content appropriately
# THEREFORE we use this to clean up the aftermath, until we intelligently re-write "download"
# NOTE: that our script "downloads" which restarts incomplete downloads performed by "download"
# ALSO is involved in this issue, therefore, ideally we will combine all three scripts into one script.
# Merge every /home/downloads/www.DOMAIN directory into /home/downloads/DOMAIN,
# then replace the www.DOMAIN directory with a symlink to DOMAIN.
# The root is overridable via DOWNLOADS_DIR for testing; default preserves
# the original hard-coded path.
downloads=${DOWNLOADS_DIR:-/home/downloads}
# NUL-delimited find + read is safe against spaces/newlines in names
# (the old `for d in $(find … | sed …)` word-split on whitespace).
# Note: -type d does not match symlinks, so already-converted www.* links
# are skipped on re-runs.
find "$downloads" -maxdepth 1 -type d -name 'www.*' -print0 |
while IFS= read -r -d '' path
do # force content into domain.tld, leaving a www.domain.tld symlink to it
    d=${path#"$downloads"/www.}   # strip exact "ROOT/www." prefix -> domain.tld
    mkdir -p "$downloads/$d"
    # Trailing slash on the rsync source copies the directory's entire
    # contents, including ALL dotfiles (the old {.??,}* glob missed
    # one-character dotfiles such as ".a").
    if rsync -auvzH "$downloads/www.$d"/ "$downloads/$d"/
    then # delete the www. copy only after a successful merge (was unconditional: data-loss risk)
        rm -rf "$downloads/www.$d"
        ln -s "$downloads/$d" "$downloads/www.$d"
    else
        printf 'de-www: rsync failed for www.%s; leaving it in place\n' "$d" >&2
    fi
done
# syntax highlighted by Code2HTML, v. 0.9.1