#!/usr/bin/ruby -w
#
# $Id: nldict,v 1.7 2003/06/11 08:39:28 ianmacd Exp $
# 
# Version : 0.9.2
# Author  : Ian Macdonald <ian@caliban.org>
#
# Copyright (C) 2003 Ian Macdonald
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2, or (at your option)
#   any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software Foundation,
#   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

require 'net/http'

# Print a one-line usage message to standard error and terminate the
# program with exit status 1.
#
# The message is written in Dutch when either LANG or LC_ALL names a Dutch
# locale (nl_NL or nl_BE), with or without a codeset or modifier suffix
# such as ".UTF-8" or "@euro". In every other case it is written in English.
def usage
  # locale values usually carry a suffix (e.g. "nl_NL.UTF-8"), so match the
  # language/territory prefix rather than comparing whole strings
  dutch = /\Anl_(?:NL|BE)(?:[.@]|\z)/

  # to_s turns an unset variable (nil) into "", which never matches
  if [ENV['LANG'], ENV['LC_ALL']].any? { |locale| locale.to_s =~ dutch }
    msg = "Gebruiksaanwijzing: nldict <woord 1> [<woord n>]"
  else
    msg = "Usage: nldict <word 1> [<word n>]"
  end

  $stderr.puts msg
  exit 1
end

# Accent names we care about, as they appear in HTML character entities
# such as "&eacute;". The order is significant: an accent's index in this
# list is added to a vowel's base code from VOWELS to obtain the code of
# the accented letter.
ACCENTS = %w(grave acute circ tilde uml).freeze

# Position of grave-accented vowels in ISO-8859-1. The other accented forms
# of each vowel follow the grave form consecutively.
VOWELS = { 'A' => 192, 'E' => 200, 'I' => 204, 'O' => 210, 'U' => 217,
           'a' => 224, 'e' => 232, 'i' => 236, 'o' => 242, 'u' => 249 }.freeze

# Construct a regular expression to find relevant character entities:
# capture 1 is the vowel, capture 2 the accent name.
#
# The pieces are assembled with Array#join. Array#to_s only concatenates
# the elements on Ruby 1.8; on later versions it returns the inspect form
# (brackets, quotes and commas), which yields an invalid pattern.
entity_regex =
  Regexp.new("&([#{VOWELS.keys.sort.join}])(#{ACCENTS.join('|')});")

# at least one word to look up is required
usage if ARGV.empty?

# Captured result fragments. Each lookup stores its fragment at the index
# of its command-line argument, so results come out in argument order
# regardless of which request finishes first. A slot stays nil when the
# response contains no result section.
words = []

# one lookup thread per word, so the requests run concurrently
threads = []
ARGV.each_with_index do |arg, idx|
  threads << Thread.new(arg, idx) do |word, slot|

    # the block form of Net::HTTP.start closes the connection when the
    # block returns, even if the request raises
    response = Net::HTTP.start('www.taalweb.nl') do |http|
      # the word must be form-encoded, otherwise characters such as
      # "&", "=", "+" or spaces would corrupt the request body
      http.post('/opzoeken/woordenboek/',
                'zoekwoord=' + URI.encode_www_form_component(word),
                'Content-Type' => 'application/x-www-form-urlencoded')
    end

    # capture relevant chunk of page; no match means no word was found
    found = /RESULTAAT.*?<\/table>/m.match(response.body)
    words[slot] = found[0] if found
  end
end

# wait for all threads to finish
threads.each {|t| t.join}

# Turn each captured HTML fragment into plain text and print it. Entities
# are converted to single ISO-8859-1 characters.
words.each do |definition|

  # a nil slot means nothing was captured for that word
  next if definition.nil?

  # nicely lay out the definitions
  definition.gsub!(/<dd><b>/i, "\n")

  # leave a blank line after last connotation
  definition.gsub!(/<big>|<br>\s*<br>/i, "\n")

  # remove remaining HTML tags
  definition.gsub!(/<.*?>/, '')

  # replace numeric entities with their ISO-8859-1 equivalent; codes above
  # 255 have no single-character form there (Integer#chr would raise a
  # RangeError), so such entities are left in the text as they are
  definition.gsub!(/&#(\d\d?\d?);/) do |entity|
    code = $1.to_i
    code <= 255 ? code.chr : entity
  end

  # replace character entities with the equivalent accented letters
  definition.gsub!(entity_regex) do |entity|
    # copy the captures first: the =~ below resets the match variables
    vowel, accent = $1, $2
    offset = ACCENTS.index(accent)

    if vowel =~ /[eiu]/i
      tilde = ACCENTS.index("tilde")

      # e, i and u have no tilde mapping: leave such an entity untouched
      next entity if offset == tilde

      # ... and for accents listed after tilde the pointer is off by one
      offset -= 1 if offset > tilde
    end

    (VOWELS[vowel] + offset).chr
  end

  print definition
end
