#!/usr/bin/perl # Higher order TP Analysis use Getopt::Std; $, = "\t"; $\ = "\n"; # Global variables # %opts, $highest_order, $n, @diff_thresholds # @warnings ([0] -> thresholds, [1] -> repetitions) # %freq, @tp, @tpTotal, and associated hashes my ($oldest_position, $relevant_position, $order, $max_order, @prev); &initialize (); while (<>){ s/\s+//g; if ($_ ne ""){ # Increment Frequency $freq{$_}++; $oldest_position = &get_oldest_position ($n); # Increment TPs $max_order = ($n < $highest_order) ? $n : $highest_order; for ($order=0; $order<$max_order; $order++){ $relevant_position = &get_relevant_position ($oldest_position, $order); ${${$tp[$order]}{$prev[$relevant_position]}}{$_}++; # Newest syllable is first entry ${$tpTotal[$order]}{$prev[$relevant_position]}++; } # Update array of previous syllables $prev[$oldest_position] = $_; $n++; if ($opts{'r'}){ # keep syllables in the order in which they appeared $sylls_in_order{$_} = $n unless ($sylls_in_order{$_}); } } } &print_results (); sub initialize { getopts("ho:aprt:d:",\%opts); if ($opts{'h'} || $opts{'?'}) { usage(); exit (1); } if ($opts{'a'} && $opts{'p'}) { die "I guess that you want SOME output, type tpa -h for available options.\n"; } $highest_order = $opts{'o'} ? $opts{'o'} : 2; if ($opts{'t'}){ @diff_thresholds = split (/\s+/, $opts{'t'}); die "The number of thresholds has to be the same as the highest order.\n" unless (@diff_thresholds == $highest_order); } $n = 0; @warnings = (0, 0); } sub usage { my $usage_string = "Usage: tpa [options] input_file\n" . "Attention, this script assumes that each line contains only one syllable.\n" . "Options:\n" . "-h\tprint this help message\n" . "-o N\tset highest order to N\n" . "-a\tprint only absolute values\n" . "-p\tprint only frequencies\n" . "-t 't1 t2 t3...'\n" . "\ttest whether the maximal difference between TPs of\n". "\ta given order is smaller than the thresholds t1...tn.\n" . "\tDoes not work with option -a.\n" . "-d N\tWarn about repetitions of to Nth order; 1st order repetitions\n" . "\tare adjacent. Does not work with option -a.\n" . "-r\tkeep syllables in the order from the stream"; print $usage_string; } sub get_oldest_position { # Compute the oldest position in the array of previous syllables. For $_[0] < $highest_order, # the array still has to be filled up. Once it has been filled up, the oldest position is the # last one. It takes then $highest_order syllables to be again the last syllable. Thus the formula # $highest_order - ($_[0]) % 4) for array base 1, and $highest_order -1 - ($_[0] % 4) for array base 0 if ($_[0] < $highest_order) { return $highest_order - 1 - $_[0]; } return $highest_order - 1 - ($_[0] % $highest_order); } sub get_relevant_position { # Compute the relevant position in the array of previous syllables to compute TPs of a given order my $oldest_position = $_[0]; my $order = $_[1]; # Starts with 0, corresponding to 1st order TPs !!! my $position = $oldest_position - ($highest_order - 1 - $order); if ($position < 0){ # ($highest_order - 1) + 1 + $position ($position being negative) return $highest_order + $position; } return $position; } sub print_results { my ($order, $s1, $s2, @sylls, @extremes); if ($opts{'r'}){ # keep syllables in the order in which they appeared @sylls = sort {$sylls_in_order{$a} <=> $sylls_in_order{$b}} keys (%sylls_in_order); } else { @sylls = sort (keys %freq); } unless ($opt{'p'}){ print "Frequencies - Absolute Numbers:"; foreach (@sylls){ print $_, $freq{$_}; } print ""; } unless ($opt{'a'}){ print "Frequencies:"; foreach (@sylls){ if ($n > 0){ printf ("%s\t%.3f\n", $_, $freq{$_}/$n); } else { printf ("%s\t%s\n", $_, "NaN"); } } } print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; for ($order=0; $order<$highest_order; $order++){ unless ($opt{'p'}){ printf ("TPs - ORDER %d\n", $order+1); printf ("TPs - Absolute Numbers:\n"); print "", join ("\t", @sylls); foreach $s1 (@sylls){ printf ("%s", $s1); foreach $s2 (@sylls){ if (${${$tp[$order]}{$s1}}{$s2}){ printf ("\t%d", ${${$tp[$order]}{$s1}}{$s2}); } else { printf ("\t%d", 0); } } printf ("\n"); } print ""; } unless ($opt{'a'}){ print "TPs:"; print "", join ("\t", @sylls); foreach $s1 (@sylls){ if ($opts{'t'}){ @extremes = &get_extremes (${$tp[$order]}{$s1}); if (($extremes[1]-$extremes[0]) > ($diff_thresholds[$order]*${$tpTotal[$order]}{$s1})){ print STDERR "WARNING: TP difference exceeding threshold, syllable $s1, order " . ($order + 1); $warnings[0] = 1; } } if ($opts{'d'}){ if ($order < $opts{'d'}){ if (${${$tp[$order]}{$s1}}{$s1}){ print STDERR "WARNING: Syllable $s1 is repeated, order " . ($order +1); $warnings[1] = 1; } } } printf ("%s", $s1); foreach $s2 (@sylls){ if (${$tpTotal[$order]}{$s1} > 0){ if (${${$tp[$order]}{$s1}}{$s2}){ printf ("\t%.3f", ${${$tp[$order]}{$s1}}{$s2}/${$tpTotal[$order]}{$s1}); } else { printf ("\t%.3f", 0); } } else { printf ("\t%s", "NaN"); } } printf ("\n"); } } printf ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n"); } if (($opts{'t'}) && ($warnings[0])){ print "WARNING: There were warnings related to unequal TPs."; } if (($opts{'d'}) && ($warnings[1])){ print "WARNING: There were warnings related repetitions."; } } sub get_extremes { my @array = sort (values %{$_[0]}); return ($array[0], $array[-1]); }