AfrikaansTranslators
Attachment 'parseWAITT.pl'
Download 1 #!/usr/bin/perl
2 #
3 # Copyright (c) 2006 Robert Schumann
4 # Licensed under GNU GPL, see http://www.gnu.org/copyleft/gpl.html
5 #
6 #
7 # This program is intended to parse the WAITTtermlys from
8 # http://groups.yahoo.com/group/rekenaarterme
9 # into a form suitable to be posted on a MoinMoin wiki (specifically, on
10 # https://wiki.ubuntu.com/AfrikaansTranslators)
11 #
12 # In order to achieve this, the following text processing must occur:
13 #
14 # 1. Print out unchanged the first few lines of the file, up to a line starting with -
15 # 2. Each line starting with a non-whitespace is the start of a new word definition.
16 # Change it from "A Definition\n" into "[[Anchor(A_Definition)]]\n A Definition:: "
17 # 3. Change {other def} into [#other_def] (whitespace, parentheses and hyphens become "_")
18 # 4. (( and )) replaced by ''', which in MoinMoin is boldface
19 # 5. <(.+)> replaced by italic form, ''<$1>''
20 # 6. Tidy up: replace "(\w)\n\t(\w)" with "$1 $2" and "\n\n\t" into "\n\t1.", which
21 # causes different definitions of the same word to be MoinMoin auto-numbered.
22 # 7. Remove existing numbering: "\s\d\.\s" and "\bI+\b" become " ". At the moment there
23 # is no Roman numbering above III
24 # 8. Bring dates onto the same line as the definition they refer to i.e. "\n(1999-12-14)"
25 # becomes " (1999-12-14)"
26 #
27 # TODO by hand, before processing:
28 # * add a newline between "force" and "forseer"
29 # * add a tab in front of "kyk ook {sibling}"
30 # * add a newline between "invoke" and "in werking stel"
31 # * add a newline between "in my humble opinion (IMHO)" and "na my beskeie"
32 # * remove "END" from the last line of the file
33 # * "hang up" only has one bracket on the "(v))", change it to "((v))"
34
35
use strict;
use warnings;

# Convert a WAITT term list (file name given as the first command-line
# argument) into MoinMoin wiki markup and print the result on STDOUT.
# The "Number N" comments refer to the processing steps listed in the header
# comment of this script.

# Build the internal anchor name for a headword or cross-reference: every
# whitespace character, parenthesis and hyphen becomes an underscore.
# Used for both the anchor definitions (Number 2) and the links pointing at
# them (Number 3), so the two can never drift apart.
sub anchor_name {
    my ($text) = @_;
    (my $anchor = $text) =~ s/[\s\(\)-]/_/g;
    return $anchor;
}

# Rewrite a single definition body into MoinMoin markup and return it.
# A definition that is empty after tidying is returned as the empty string
# (without the list prefix).
sub format_definition {
    my ($def) = @_;

    chomp $def;
    $def =~ s/\n\t/ /g;                                   # Number 6: join continuation lines
    $def =~ s/^\s+//;                                     # Number 6: drop leading whitespace
    $def =~ s/\{([^}]+)\}/'[#' . anchor_name($1) . ']'/ge; # Number 3: {other def} -> [#other_def]
    $def =~ s/[\(\)]{2}/'''/g;                            # Number 4: (( and )) -> bold
    $def =~ s/(<[^>]+>)/''${1}''/g;                       # Number 5: <...> -> italic
    $def =~ s/\s\d\.\s|\bI+\b/ /g;                        # Number 7: strip existing numbering

    # Number 6: "1." lets MoinMoin auto-number multiple definitions of a word.
    # Test the length rather than truth so a definition of "0" is kept.
    $def = "\t1. $def" if length $def;
    return $def;
}

my $input_path = $ARGV[0]
    or die("Please supply the name of an input dictionary file on the command line");

open(my $fh, '<', $input_path) or die("can't open $input_path: $!");

# Number 1: copy the preamble verbatim inside a {{{ }}} block.  The first
# line starting with "-" ends the preamble and is discarded.  Stopping at
# end-of-file as well keeps a file without such a line from looping forever.
my @output = ("{{{");
while (defined(my $line = <$fh>)) {
    last if $line =~ /^-/;
    push @output, $line;
}
push @output, "}}}";

# Read in all definitions, and then split on <newline><wordboundary>, i.e.
# at every line that starts with a word character.
my $wholefile = join('', <$fh>);
close($fh);

# Put all definitions into a hash: first line of the chunk => rest of chunk.
# Chunks that do not have a headword line followed by a newline (for example
# the empty chunk before the first entry) are skipped instead of silently
# reusing the captures of an earlier match.
my %dict;
for my $entry (split /\n\b/, $wholefile) {
    next unless $entry =~ /([^\n]+)(\n.*)/s;
    $dict{$1} = $2;
}

# Now process it all, in sorted headword order.
for my $word (sort keys %dict) {
    my $defstring = $dict{$word};

    # Number 8: pull a date on its own line up onto the definition's line.
    $defstring =~ s/\n\t(\(\d+-\d+-\d+\))/\t ''${1}''/g;

    # A blank line followed by a tab separates definitions of the same word.
    my @defs = map { format_definition($_) } split /\n\n\t/, $defstring;

    # Number 2: anchor plus definition-list term, then the definitions.
    push @output, "\n\n[[Anchor(" . anchor_name($word) . ")]]\n " . $word . ":: ";
    push @output, join("\n", @defs);
}

# Interpolating the array joins its elements with a single space (the
# default list separator); kept as-is so the generated markup is unchanged.
print "@output\n";
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.