Word Frequency
Given n files as command line arguments, calculate the frequency of words for each file, and display the results in a table. A word is defined as a contiguous sequence of one or more letters (see the definition of letters above). Every non-letter character is to be considered whitespace.
The output should be a multi-column list, the first column being the list of words encountered (in lowercase) in any of the input files (sorted according to the C locale), and subsequent columns containing the number of occurrences of that word in file1 ... filen, separated by spaces. Example output for 2 files:
a 5 8
the 6 3
word 2 0
she 3 5
#!/bin/bash
# Word frequencies
# $Id: unix9-wordfreq.sh 191 2006-03-29 11:07:00Z cactus $
# See http://cactus.rulez.org/elte/2005-1-unix/#9 for a description of what it does
# Licensed under the GNU General Public License, version 2
#######################################
# Print the usage message and terminate the script.
# Arguments: none
# Outputs:   the usage text on stdout
# Returns:   never returns; exits the shell with status 0
#######################################
function help () {
  # ${0##*/} strips the directory part without running basename, so script
  # paths containing whitespace or starting with a dash are handled correctly.
  local self=${0##*/}
  # The here-document body and its terminator must stay at column 0.
  cat << EOF
Usage: $self FILE1 [FILE2...]
Creates statistics about the words occuring in the files.
Options:
-help Display this help message
(C) 2005 Dr. ERDI Gergo <cactus@cactus.rulez.org>
Version: \$Id: unix9-wordfreq.sh 191 2006-03-29 11:07:00Z cactus $
EOF
  exit 0
}
#######################################
# Report a fatal error and abort the script.
# Arguments: $@ - the message; several arguments are joined with spaces
# Outputs:   "ERROR: <message>!" on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
function error () {
  # The message is passed quoted and as data (never as the format string), so
  # file names containing whitespace, glob characters or % are shown verbatim.
  printf 'ERROR: %s!\n' "$*" >&2
  exit 1
}
#######################################
# Validate the command line.
# This works differently from the usual option parsers: it only walks over
# the arguments to see whether -help is among them; everything else must be
# a readable regular file.
# Arguments: $@ - the script's command-line arguments
# Outputs:   via help (usage text) or error (diagnostic on stderr)
# Returns:   0 when every argument is a readable regular file; does not
#            return when help or error terminates the script
#######################################
function options () {
  local arg
  # Iterate instead of recursing on "$1": an empty-string argument is then
  # rejected like any other unreadable file instead of silently ending the
  # validation of the arguments that follow it.
  for arg in "$@"; do
    case "$arg" in
      -help)
        help
        ;;
      *)
        [[ -f "$arg" && -r "$arg" ]] || error "$arg: Unable to open file"
        ;;
    esac
  done
}
#######################################
# Emit (but do not run) the shell command that counts the words of one file.
#
# The emitted command runs awk over the file and pipes the result to sort.
# The awk program splits every line on single non-letter characters, lowercases
# each non-empty all-letter field and counts it; at the end it prints one
# "word count" line per distinct word. awk's own stderr is discarded.
#
# Lowercasing uses tolower() for ASCII letters and the ekezet_lower table for
# the accented letters. NOTE(review): the table is consulted one substr()
# character at a time, so it presumably expects a single-byte (ISO-8859-2)
# encoding of both this script and the input - confirm.
#
# Arguments: $1 - path of the file to analyse
# Outputs:   the command line, as text, on stdout (meant to be eval-ed)
#######################################
function awk_count () {
  local file=$1
  # The awk program is kept verbatim; it must not contain a single quote
  # because it is embedded between single quotes in the emitted command.
  local awkprog='
BEGIN {
FS="[^a-zA-ZáÁéÉíÍóÓöÖõÕúÚüÜûÛ]"
ekezet_lower["Á"] = "á";
ekezet_lower["É"] = "é";
ekezet_lower["Í"] = "í";
ekezet_lower["Ó"] = "ó";
ekezet_lower["Ö"] = "ö";
ekezet_lower["Õ"] = "õ";
ekezet_lower["Ú"] = "ú";
ekezet_lower["Ü"] = "ü";
ekezet_lower["Û"] = "û";
}
function iso88592_tolower (s) {
ret=""
for (j = 1; j <= length(s); j++)
ret = ret iso88592_tolower_c(substr(s, j, 1));
return ret;
}
function iso88592_tolower_c (c) {
if (match (c, "[a-zA-Z]"))
return tolower (c);
else if (c in ekezet_lower)
return ekezet_lower[c];
return c;
}
/[a-zA-ZáÁéÉíÍóÓöÖõÕúÚüÜûÛ]/ {
for (i = 1; i != NF + 1; i++)
{
if (match ($i, "^[a-zA-ZáÁéÉíÍóÓöÖõÕúÚüÜûÛ]+$"))
{
words[iso88592_tolower($i)]++;
}
}
}
END {
for (i in words)
printf "%s %s\n", i, words[i];
}
'
  # %q shell-quotes the file name so that names containing quotes, spaces or
  # other shell metacharacters survive the later eval unchanged instead of
  # breaking (or injecting into) the generated pipeline.
  printf "awk '%s' %q 2>/dev/null|sort\n" "$awkprog" "$file"
}
#######################################
# Emit (but do not run) the shell command that appends one count column.
# (Original author's note, translated: this is element-wise processing,
# "huge Fothi power" :)
#
# The emitted awk command merges two streams of lines that must both be
# sorted by their first field in plain byte order:
#   left  - its stdin: "word c1 ... cN"
#   right - the output of the command in $1: "word count"
# Every word of either stream is printed once, followed by the left counts and
# the right count; a side that lacks the word contributes zeros.
# awk's own stderr is discarded.
#
# Arguments: $1 - shell command producing the right stream (it is run through
#                 process substitution inside the emitted command)
#            $2 - optional number of count columns of the left stream; only
#                 needed when that stream is empty, because the column count
#                 is otherwise taken from the data. Defaults to 1.
# Outputs:   the command line, as text, on stdout (meant to be eval-ed)
#######################################
function awk_join () {
  local right_cmd=$1
  local width=${2:-1}
  # Only a plain non-negative integer may reach the generated command line.
  [[ $width =~ ^[0-9]+$ ]] || width=1
  # The program is embedded between single quotes in the emitted command, so
  # it must not contain a single quote (comments included).
  local awkprog='
# Advance the left stream (stdin). key1 is the word, val1 all count columns.
function read1(    i) {
  if (eof1)
    return;
  eof1 = ((getline) <= 0);
  if (eof1)
    return;
  key1 = $1 "";          # force a string so keys never compare numerically
  val1 = $2;
  width = NF - 1;        # real column count replaces the preset value
  for (i = 3; i <= NF; ++i)
    val1 = val1 " " $i;
}
# Advance the right stream (file f2). key2 is the word, val2 its count.
function read2() {
  if (eof2)
    return;
  eof2 = ((getline < f2) <= 0);
  if (eof2)
    return;
  key2 = $1 "";
  val2 = $2;
}
# The word occurs on both sides.
function emit_both() {
  printf "%s %s %s\n", key1, val1, val2;
}
# The word occurs only on the left: the new column is 0.
function emit_left_only() {
  printf "%s %s 0\n", key1, val1;
}
# The word occurs only on the right: every earlier column is 0.
function emit_right_only(    i) {
  printf "%s ", key2;
  for (i = 0; i < width; ++i)
    printf "0 ";
  printf "%s\n", val2;
}
BEGIN {
  FS = " ";
  read1();
  read2();
  while (!eof1 || !eof2) {
    if (!eof1 && !eof2 && key1 == key2) {
      emit_both();
      read1();
      read2();
    } else if (eof2 || (!eof1 && (key1 < key2))) {
      emit_left_only();
      read1();
    } else {
      emit_right_only();
      read2();
    }
  }
  exit
}
'
  printf "awk -v f2=<(%s) -v width=%s '%s' 2>/dev/null\n" \
    "$right_cmd" "$width" "$awkprog"
}
# Main program: validate the arguments, build one pipeline as text
# (a counter for the first file, then one join stage per further file)
# and finally run it.
(( $# >= 1 )) || error "Missing arguments"
options "$@"

# File names are passed quoted so that names containing whitespace reach
# awk_count as a single argument.
fullpipe=$(awk_count "$1")
shift

# Number of count columns produced by the pipeline built so far; handed to
# awk_join so it can pad with zeros even when all earlier files had no words.
columns=1
for file in "$@"; do
  counter=$(awk_count "$file")
  awkline=$(awk_join "$counter" "$columns")
  fullpipe="$fullpipe | $awkline"
  columns=$((columns + 1))
done

# Force the C locale for every category: sort and awk must order the words
# byte-wise and identically, and an inherited LC_COLLATE would otherwise
# still influence sort.
export LANG=C LC_ALL=C LC_CTYPE=C

# NOTE(review): the pipeline is assembled as text and executed with eval, so
# it is only as safe as the quoting that awk_count and awk_join apply to the
# file names they embed.
eval "$fullpipe"
Reposted from: http://gergo.erdi.hu/elte/2005-1-unix/