AfrikaansTranslators

Attachment 'parseWAITT.pl'

Download

   1 #!/usr/bin/perl
   2 #
   3 # Copyright (c) 2006 Robert Schumann
   4 # Licensed under GNU GPL, see http://www.gnu.org/copyleft/gpl.html
   5 # 
   6 #
   7 # This program is intended to parse the WAITTtermlys from 
   8 #   http://groups.yahoo.com/group/rekenaarterme
   9 # into a form suitable to be posted on a MoinMoin wiki (specifically, on
  10 #   https://wiki.ubuntu.com/AfrikaansTranslators)
  11 #
  12 # In order to achieve this, the following text processing must occur:
  13 #
  14 # 1. Print out unchanged the first few lines of the file, up to a line starting with -
  15 # 2. Each line starting with a non-whitespace is the start of a new word definition.
  16 #    Change it from "A Definition\n" into "[[Anchor(A_Definition)]]\n A Definition:: "
  17 # 3. Change {other def} into [#other_def] (whitespace, parentheses and hyphens become "_")
  18 # 4. (( and )) replaced by ''', which in MoinMoin is boldface
  19 # 5. <(.+)> replaced by italic form, ''<$1>''
  20 # 6. Tidy up: replace "(\w)\n\t(\w)" with "$1 $2" and "\n\n\t" into "\n\t1.", which
  21 #    causes different definitions of the same word to be MoinMoin auto-numbered.
  22 # 7. Remove existing numbering: "\s\d\.\s" and "\bI+\b" become " ".  At the moment there
  23 #    is no Roman numbering above III
  24 # 8. Bring dates onto the same line as the definition they refer to, in italics, i.e.
  25 #    "\n\t(1999-12-14)" becomes "\t ''(1999-12-14)''"
  26 #
  27 # TODO by hand, before processing:
  28 # * add a newline between "force" and "forseer"
  29 # * add a tab in front of "kyk ook {sibling}"
  30 # * add a newline between "invoke" and "in werking stel"
  31 # * add a newline between "in my humble opinion (IMHO)" and "na my beskeie"
  32 # * remove "END" from the last line of the file
  33 # * "hang up" only has one bracket on the "(v))", change it to "((v))"
  34 
  35 
  36 $ARGV[0] or die("Please supply the name of an input dictionary file on the command line");
  37 
  38 open(FILE, $ARGV[0]) || die("can't open $ARGV[0]: $!");
  39 
  40 # Number 1
  41 @output[0] = "{{{";
  42 while ((my $temp = <FILE>) !~ /^-/) {
  43 	push @output,$temp;
  44 }
  45 push @output, "}}}";
  46 
  47 # Read in all definitions, and then split on <newline><wordboundary>
  48 my $wholefile = join ('', <FILE>);
  49 my @alldefs = split(/\n\b/,$wholefile);
  50 
  51 # Put all definitions into a hash.
  52 my %dict = ();
  53 foreach $def (@alldefs) {
  54 	$def =~ /([^\n]+)(\n.*)/s;
  55 	$dict{$1} = $2;
  56 }
  57 
  58 # Now process it all
  59 foreach $word (sort keys %dict) {
  60 	# Number 2
  61 	$compact = $word;
  62 	$compact =~ s/[\s\(\)-]/_/g; # this is the internal anchor link regexp
  63 	$fancyword = join( "",("\n\n[[Anchor(", $compact, ")]]\n ", $word, ":: ") );
  64 
  65 	$defstring = $dict{$word};
  66 	$defstring=~s/\n\t(\(\d+-\d+-\d+\))/\t ''\1''/g; #' Number 8
  67 	@defs = split(/\n\n\t/,$defstring);
  68 	foreach $def (@defs) {
  69 		chomp($def);
  70 		$def=~s/\n\t/ /g; # Number 6
  71 		$def=~s/^\s+//; # Number 6
  72 		$def=~s{\{([^\}]+)\}}
  73 		       {my $tmp = $1; $tmp=~s/[\s\(\)-]/_/g; "[#$tmp]";
  74 		       }gsex; # Number 3; note internal anchor link
  75 		$def=~s/[\(\)]{2}/'''/g; #' Number 4
  76 		$def=~s/(<[^>]+>)/''\1''/g; # Number 5
  77 		$def=~s/\s\d\.\s|\bI+\b/ /g; # Number 7
  78 		$def = "\t1. $def" if $def; # Number 6
  79 	}
  80 	push @output, $fancyword;
  81 	push @output, join("\n",@defs);
  82 }
  83 
  84 close INFO;
  85 
  86 print "@output\n";

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2006-02-17 15:25:30, 3.0 KB) [[attachment:parseWAITT.pl]]
 All files | Selected Files: delete move to page

You are not allowed to attach a file to this page.