# Copyright © 2010 Ivan Kanakarakis
#
# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

#!/usr/bin/env bash

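# default directories
# TODO: work_dir, dumppath, tokenpath and resultpath are used further down, but no assignment is visible in this file; define them here.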


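# default output files
# TODO: mapfile, bvalfile, plotdata and plotimg are used further down, but no assignment is visible in this file; define them here.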


# Fetch random Wikipedia pages as plain-text dumps (10,000 by default).
#
# Arguments:
#   $1 - number of pages to fetch (optional, default 10000)
#   $2 - URL dumped on every iteration (optional, see note below)
# Globals:
#   dumppath (read) - directory that receives the numbered <n>.raw dumps
# Outputs:
#   progress messages on stdout, one warning per failed page on stderr
fetch() {
	echo "Starting fetch process.."
	local count="${1:-10000}"
	# NOTE(review): the URL used to be an empty string, which makes every
	# elinks call fail. The default below assumes the random-page endpoint
	# of the Greek Wikipedia (the rest of the script keeps Greek letters
	# only) -- confirm before starting a long run.
	local src_url="${2:-https://el.wikipedia.org/wiki/Special:Random}"
	local file

	# Abort early instead of writing dumps to "/<n>.raw".
	: "${dumppath:?dumppath is not set}"

	for (( file = 1; file <= count; file++ )); do
		if ! elinks -dump -no-numbering -no-references "$src_url" \
			> "$dumppath/$file.raw"; then
			# Best effort: one failed page must not stop the whole run,
			# but do not leave an empty/partial dump behind either.
			echo "warning: could not fetch page $file" >&2
			rm -f -- "$dumppath/$file.raw"
		fi
	done

	echo "Fetching done"
}

# Clean the raw dumps: keep only clear text, try to remove any garbage,
# and write one lowercase token per line.
#
# Globals:
#   dumppath  (read) - directory holding the raw dumps
#   tokenpath (read) - directory receiving one .tok file per dump
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Exits with status 1 when there is nothing to process.
tokenize() {
	echo "Processing tokens.."

	: "${dumppath:?dumppath is not set}"
	: "${tokenpath:?tokenpath is not set}"

	# Glob with the directory prefix instead of cd-ing into it, so the
	# caller's working directory (and any relative path used by later
	# steps) is left untouched.
	shopt -s nullglob dotglob
	local files=("$dumppath"/*)
	shopt -u nullglob dotglob
	if (( ${#files[@]} == 0 )); then
		echo "no data to process. try fetch first" >&2
		exit 1
	fi

	local rawfile name tokfile
	for rawfile in "${files[@]}"; do
		[[ -f "$rawfile" ]] || continue
		name="${rawfile##*/}"
		# Swap only the .raw suffix; a "raw" elsewhere in the name stays.
		tokfile="$tokenpath/${name%.raw}.tok"

		# 1. drop link/image/bullet/rule lines, "[..." lines and blank lines
		# 2. turn every character outside α-ω/Α-Ω into a line break
		# 3. drop the empty lines this creates and lowercase the rest
		# The stages stay separate seds: stage 3 has to see the line breaks
		# inserted by stage 2 as real line boundaries.
		# NOTE(review): depending on the locale, the α-ω/Α-Ω ranges may not
		# include accented letters (ά, έ, ώ, ...), which would split words
		# at them -- confirm this is intended.
		sed -r '/.*(\s+Link|IMG|\s+\*|\s+_|^\[|^$).*/d' "$rawfile" \
			| sed 's/[^α-ωΑ-Ω]/\n/g' \
			| sed -e '/^$/d' -e 's/\(.*\)/\L\1/' \
			> "$tokfile"
	done

	echo "Tokens ready"
}

# Map rank to frequency for all words of all token files.
#
# Every line written to $mapfile has the form "<rank> <frequency> <word>",
# most frequent word first.
#
# Globals:
#   tokenpath (read) - directory holding the token files
#   mapfile   (read) - path of the rank/frequency map to write
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Exits with status 1 when there are no token files.
sortmap() {
	echo "Mapping started.."

	: "${tokenpath:?tokenpath is not set}"
	: "${mapfile:?mapfile is not set}"

	# Glob with the directory prefix instead of cd-ing into it, so the
	# caller's working directory is left untouched.
	shopt -s nullglob dotglob
	local files=("$tokenpath"/*)
	shopt -u nullglob dotglob
	if (( ${#files[@]} == 0 )); then
		echo "cannot read data. try tokenize first" >&2
		exit 1
	fi

	# sort + uniq -c counts every word, sort -nr orders by count, and
	# pr -n -t prefixes the line number, which serves as the rank.
	sort "${files[@]}" \
		| uniq -ci \
		| sort -nr \
		| pr -n -t > "$mapfile"

	echo "Mapping complete"
}

# Calculate the b factor.
#
# Zipf's Law:
#
#   Freq = 1 / rank^b  =>  log(Freq) = -b * log(rank)
#   => b = - log(Freq) / log(rank) = -log[rank](Freq)
#
# Reads "<rank> <frequency> <word>" lines from $mapfile and writes a
# tab-separated table (rank, frequency, frequency / number of map lines,
# per-word b, word) to $bvalfile, followed by the mean of all per-word b
# values. Rank 1 contributes b = 0 because log(1) = 0 would divide by zero.
#
# Globals:
#   mapfile  (read) - rank/frequency map
#   bvalfile (read) - path of the result table to write
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Exits with status 1 when the map is missing or empty.
calcb() {
	echo "Calculating b for all words.."

	: "${mapfile:?mapfile is not set}"
	: "${bvalfile:?bvalfile is not set}"

	if [[ ! -e "$mapfile" ]]; then
		echo "cannot read data. try mapping first" >&2
		exit 1
	fi

	local bval=0 bw=0 nwords rank freq word PnFreq
	nwords=$(wc -l < "$mapfile") || exit 1
	# An empty map would make every division below a division by zero.
	if (( nwords == 0 )); then
		echo "cannot read data. try mapping first" >&2
		exit 1
	fi

	{
		printf 'Rank\tFrequency\tPn-Frequency\tb-value\t\tWord\n'

		while read -r rank freq word; do
			PnFreq=$(bc -l <<< "$freq / $nwords")
			if (( rank <= 1 )); then
				bw=0
			else
				# The leading minus follows the derivation above; without
				# it every b came out with the wrong sign.
				bw=$(bc -l <<< "-l($PnFreq) / l($rank)")
			fi
			printf '%s\t%s\t%s\t%s\t%s\n' \
				"$rank" "$freq" "$PnFreq" "$bw" "$word"
			bval=$(bc -l <<< "$bval + $bw")
		done < "$mapfile"

		bval=$(bc -l <<< "$bval / $nwords")
		echo "Final b is $bval"
	} > "$bvalfile"

	echo "Done calculating"
}

# Visualize the results, create a graph plot.
#
# Extracts the first two columns (rank, frequency) of $mapfile into
# $plotdata and lets gnuplot draw them on logarithmic axes into $plotimg.
#
# Globals:
#   mapfile  (read) - rank/frequency map
#   plotdata (read) - path of the two-column data file to write
#   plotimg  (read) - path of the PNG image to write
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Exits with status 1 when the map is missing or gnuplot fails.
graph() {
	echo "Starting graph creation"

	: "${mapfile:?mapfile is not set}"
	: "${plotdata:?plotdata is not set}"
	: "${plotimg:?plotimg is not set}"

	if [[ ! -e "$mapfile" ]]; then
		echo "cannot read data. try mapping first" >&2
		exit 1
	fi

	awk '{print $1 " " $2}' "$mapfile" > "$plotdata"

	# Unquoted delimiter: $plotimg and $plotdata are expanded by the shell
	# before gnuplot reads the script.
	if ! gnuplot << EOF
set logscale
set xlabel "Rank"
set ylabel "Frequency"
set title "Zipf's Law - Greek language - Wikipedia documents collection"
set term png
set output "$plotimg"
plot "$plotdata"
EOF
	then
		echo "gnuplot failed, no graph created" >&2
		exit 1
	fi

	echo "Graph created"
}

# The standard usage function: prints the help text on stdout.
# The script name in the first line is derived from $0.
usage() {
	cat << EOF
usage: $(basename "$0") options

	Options are:
	-a	all, same as -t -m -b -g [Note: no fetch]
	-f	fetch files
	-t	tokenize files
	-m	sort and map tokens - get rank and freq
	-b	calculate b
	-g	create graph plot
	-h	help, print this help message
EOF
}

# expect at least an argument
if (( $# == 0 )); then
	usage
	exit 1
fi

# set up environment
mkdir -p "$work_dir" "$dumppath" "$tokenpath" "$resultpath"

# parse arguments (only the first one is looked at)
case "$1" in
	-a)
		tokenize
		sortmap
		calcb
		graph
		;;
	-f) fetch ;;
	-t) tokenize ;;
	-m) sortmap ;;
	-b) calcb ;;
	-g) graph ;;
	-h)
		usage
		exit 0
		;;
	*)
		# Unknown option: show the help text and signal the error.
		usage
		exit 1
		;;
esac

# all good :-)
exit 0