#!/bin/sh
#
# $Id: zapdups.X,v 1.13 2015-02-08 00:35:41-08 geoff Exp $
#
# Copyright 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
#    such.  Binary redistributions based on modified source code
#    must be clearly marked as modified versions in the documentation
#    and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
#    link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
#    products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# Report or get rid of duplicates in various components of a dictionary.
#
# Usage:
#
#	zapdups [-d [-n]] [-l langfile] dict-0 dict-1 ...
#
# Dictionaries starting with dict-1 (not dict-0!) are examined,
# looking for words that appear in any earlier dictionary.  If any
# duplicates are found, they are reported to the standard output.
#
# If the -d switch is specified, duplicates are removed from later
# dictionaries.  The modification is done in-place.  This switch
# should normally be used after examining the output of an earlier
# run.  The -d switch takes a long time to run, because it uses
# munchlist to reduce the dictionary once duplicates are removed.
# The -n switch can be used to suppress the running of munchlist, to
# save time.
#
# If the -l switch is specified, the language tables are gotten from
# the specified file; otherwise they come from $LIBDIR/english.aff.
#
# $Log: zapdups.X,v $
# Revision 1.13  2015-02-08 00:35:41-08  geoff
# Be a bit more paranoid about creating temporary files.
#
# Revision 1.12  2005/04/27 01:18:35  geoff
# Work around idiotic POSIX incompatibilities in sort.  Add secure
# temp-file handling.
#
# Revision 1.11  2005/04/14 14:38:23  geoff
# Update license.  Protect against modernized (i.e., incompatible) and
# internationalized sort commands.  Don't use /usr/tmp.
#
# Revision 1.10  2001/09/06 00:30:29  geoff
# Changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.9  2001/07/25 21:51:47  geoff
# Minor license update.
#
# Revision 1.8  2001/07/23 20:24:04  geoff
# Update the copyright and the license.
#
# Revision 1.7  1999/01/07 01:58:10  geoff
# Update the copyright.
#
# Revision 1.6  1995/01/08  23:23:58  geoff
# Support variable hashfile suffixes for DOS purposes.
#
# Revision 1.5  1994/01/25  07:12:24  geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#

#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.  The
# join(1) and comm(1) calls later in this script depend on that same
# order.
#
# LC_ALL takes precedence over LANG and over every other LC_* variable,
# so it has to be exported too; a plain assignment reaches child
# processes only if the variable was already in the environment.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above.  It can't hurt to set them to a
# sensible value.
#
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME
# Directory that holds the default language tables (english.aff).
LIBDIR=c:/usr/local/lib
# Parent directory for scratch files: $TMPDIR if set, otherwise /tmp.
TDIR=${TMPDIR-/tmp}
#
# Create a private scratch directory.  mktemp is preferred; if it is
# missing or fails, fall back to a PID-based name created under a
# restrictive umask.  The fallback must run in the current shell (brace
# group, not parentheses): inside a subshell the TEMPDIR assignment and
# the "exit 1" would be lost, leaving TEMPDIR empty.  Only the
# umask/mkdir pair is run in a subshell, so that the umask change does
# not leak into the rest of the script.
#
TEMPDIR=`mktemp -d "${TDIR}/zapdXXXXXXXXXX" 2>/dev/null`  ||  {
    TEMPDIR="$TDIR/zapd$$"
    (umask 077; mkdir "$TEMPDIR")  ||  {
	echo "Can't create temp directory: ${TDIR}/zapd$$" 1>&2
	exit 1
    }
}
# Common prefix of every scratch file name.
TMP=${TEMPDIR}/zapd
# Extra options handed (unquoted, so they split into words) to sort(1).
# NOTE(review): the !!SORTTMP!! tag presumably marks this line for
# editing at build time -- keep it intact.
SORTTMP="-T ${TDIR}"			# !!SORTTMP!!
USAGE="zapdups [-d [-n]] [-l langfile] dict-0 dict-1 ..."

#
# usage: print the usage message on stderr, remove the scratch directory
# (it has already been created by the time arguments are parsed, so
# exiting without this would leave it behind), and exit with status 1.
#
usage()
{
    echo "$USAGE" 1>&2
    rm -rf "$TEMPDIR"
    exit 1
}

#
# Option defaults:
#	delete   - "yes" if duplicates are to be removed in place (-d)
#	munchit  - "no" if munchlist is to be skipped after removal (-n)
#	langtabs - language-table (affix) file (-l)
#
delete=no
munchit=yes
langtabs=${LIBDIR}/english.aff
#
# Consume leading switches; stop at the first argument that does not
# begin with a dash (or when the arguments run out, since "" matches
# the catch-all case).
#
while :
do
    case "$1" in
	-d)
	    delete=yes
	    shift
	    ;;
	-l)
	    # -l needs a value.  Without this check the second shift
	    # would run past the end of the argument list, which aborts
	    # some shells with an obscure error instead of the usage
	    # message.
	    [ $# -ge 2 ]  ||  usage
	    langtabs="$2"
	    shift; shift
	    ;;
	-n)
	    munchit=no
	    shift
	    ;;
	-*)
	    usage
	    ;;
	*)
	    break
	    ;;
    esac
done

#
# At least two dictionaries are required: dict-0 is only a reference,
# and there must be something to compare against it.
#
if [ $# -lt 2 ]
then
    usage
fi

#
# Scratch files, all named from the common $TMP prefix:
#
FAKEHASH=${TMP}.hash	# hash file compiled from the language tables
FAKEDICT=${TMP}.b	# one-word dictionary fed to buildhash
SEEN=${TMP}.c		# tagged words from all dictionaries seen so far
LATEST=${TMP}.d		# tagged words from the dictionary being examined
DUPS=${TMP}.e		# join of SEEN and LATEST, i.e., the duplicates

#
# Remove the scratch directory if we are killed.  Hangup, interrupt,
# and terminate (1, 2, 15) count as failures; a broken pipe (13) exits
# with status 0.  The commands are single-quoted so that $TEMPDIR is
# expanded, inside double quotes, when the trap fires; otherwise a
# directory name containing white space or glob characters would make
# "rm -rf" operate on the wrong paths.
#
trap 'rm -rf "$TEMPDIR"; exit 1' 1 2 15
trap 'rm -rf "$TEMPDIR"; exit 0' 13

#
# Create a dummy dictionary to hold a compiled copy of the language
# tables.  The resulting hash file is what ispell is given (-d) when the
# real dictionaries are expanded later on.
#
# All file names are quoted: the language-table name can come straight
# from the -l switch and the scratch names derive from $TMPDIR, so
# either may contain white space.
#
echo 'QQQQQQQQ' > "$FAKEDICT"
buildhash -s "$FAKEDICT" "$langtabs" "$FAKEHASH"  ||  {
    echo "Couldn't create fake hash file" 1>&2
    rm -rf "$TEMPDIR"
    exit 1
}
# Remove the dummy dictionary together with any files whose names merely
# start with its name (presumably buildhash by-products); the unquoted
# "*" is a deliberate glob.
rm -f "$FAKEDICT"*

nl='
'
# A tab separates each word from the name of the dictionary it came
# from.  It is kept in a variable so that the sed and join commands
# below don't rely on invisible literal tabs.
tab=`printf '\t'`

#
# quote_sed_pat NAME
#	Print NAME with every character that is special in a sed basic
#	regular expression, plus the "@" delimiter used below, escaped
#	with a backslash, so that it matches itself literally.
#
quote_sed_pat()
{
    printf '%s\n' "$1" | sed -e 's/[[\\.*^$@]/\\&/g'
}

#
# quote_sed_rep NAME
#	Print NAME with every character that is special in the
#	replacement half of a sed "s" command (backslash, "&", and the
#	"@" delimiter) escaped with a backslash.
#
quote_sed_rep()
{
    printf '%s\n' "$1" | sed -e 's/[\\&@]/\\&/g'
}

#
# Expand dictionary 0 into a temp file, one word per line, sorted, with
# each word tagged with a tab and the dictionary's name.
#
dictrep=`quote_sed_rep "$1"`
ispell -e -d "$FAKEHASH" < "$1" \
  | tr ' ' "$nl" \
  | sort $SORTTMP -u \
  | sed -e 's@$@'"${tab}${dictrep}@" \
  > "$SEEN"
shift

#
# For each subsequent dictionary:
#
#	(1) Expand it into a temp file
#	(2) Use join to report the duplicates
#	(3) If we are editing, use comm to remove the duplicates and
#	    rewrite the dictionary in place
#	(4) Merge the expanded dictionary (as expanded in step 1) into
#	    the list of words already seen.
#
# $SORTTMP is deliberately left unquoted so that it splits into an
# option and its argument.
#
for dict
do
    # The dictionary name is spliced into sed scripts, so it must be
    # escaped or names containing "@", "&", ".", etc. would corrupt
    # the tags or keep the deletion step from matching anything.
    dictpat=`quote_sed_pat "$dict"`
    dictrep=`quote_sed_rep "$dict"`
    ispell -e -d "$FAKEHASH" < "$dict" \
      | tr ' ' "$nl" \
      | sort $SORTTMP -u \
      | sed -e 's@$@'"${tab}${dictrep}@" \
      > "$LATEST"
    # Each output line is: word, earlier dictionary, this dictionary.
    join -t "$tab" "$SEEN" "$LATEST" > "$DUPS"
    if [ -s "$DUPS" ]
    then
	cat "$DUPS"
	if [ "$delete" = yes ]
	then
	    # Reduce each duplicate line to "word<tab>dict" so that it
	    # is identical to the matching line of $LATEST, let comm
	    # drop those lines, strip the tag again, and write the
	    # surviving words back over the dictionary.
	    sed -e "s@${tab}.*${tab}${dictpat}@${tab}${dictrep}@" "$DUPS" \
	      | comm -23 "$LATEST" - \
	      | sed -e "s@${tab}${dictpat}@@" \
	      | if [ "$munchit" = yes ]
		then
		    munchlist -l "$langtabs" > "$dict"
		else
		    sort $SORTTMP -u -o "$dict"
		fi
	fi
    fi
    # We must do a shift so that $# remains correct: the word list of
    # the "for" was fixed when the loop started, so the shift only
    # keeps a count of the dictionaries still to come.
    shift
    if [ $# -gt 0 ]
    then
	# Not the last dictionary: its words join the "seen" list.
	sort $SORTTMP -u -o "$SEEN" "$LATEST" "$SEEN"
    fi
done \
  | sort -u
rm -rf "$TEMPDIR"
