#!/bin/bash
#
# pat2pdf version 1.01
# Copyright (c)2000 Oren Tirosh <oren@hishome.net>
# Released under the GPL 
#
# This script connects to the USPTO patent database, retrieves the TIFF
# patent images and converts them into a single pdf file using GhostScript.
#
# Yes, I know, you would have written it in perl/python/(insert your 
# favourite scripting language here)
#
# It requires an http fetcher (lynx by default), GhostScript 
# and tiff2ps (part of libtiff)
#
# For best results (small PDFs) use GhostScript 5.5 or higher.
#
#Usage:
# pat2pdf <patnum>
#
# Result is a file in the current directory named pat<patnum>.pdf
#
#Bugs:
#
# Error checking and recovery could be better.
#
#Homepage:
# http://www.tothink.com/pat2pdf
#

# Real name of the patent search site. For some reason the USPTO links to it
# by IP address.
# DNS mapping disappeared for this name sometime Jan/Feb 2001
#SITENAME="http://pto.dwsearch.com"
# using pto's IP number directly still works
SITENAME="http://164.195.100.11"

# Some utility functions:

# Succeeds (exit status 0) if the first string contains the second string
# (the second string may use the globbing chars ?*[x-y])
contains() {
  # $1 - string to search
  # $2 - text to look for; deliberately left unquoted inside the expansion
  #      so that ? * [x-y] in it act as glob characters
  # Exit status is 0 when $1 is non-empty and matches *$2*, non-zero otherwise.
  #
  # [[ ... && ... ]] is used instead of "[ ... -a ... ]": with -a, a $1 that
  # happens to look like a test operator (for example "=") is misparsed and
  # the test fails with a syntax error instead of answering the question.
  [[ -n "$1" && -z "${1##*$2*}" ]]
}

# Change this if you prefer wget, curl, etc.
url2stdout() {
  # $1 - URL to retrieve
  # Writes the raw, unrendered source of the document to stdout; anything the
  # fetcher prints on stderr is discarded. The exit status is the fetcher's.
  local target=$1
  lynx -dump -source "$target" 2>/dev/null
}

# Die with a message
die() {
  # $* - message words, joined with spaces and printed on stderr
  # Terminates the current (sub)shell with exit status 1.
  #
  # The message goes to the already-open descriptor 2 (>&2). Re-opening
  # /dev/stderr by name can truncate a log file that stderr was redirected to.
  # printf is used so a message such as "-n" is printed literally.
  printf '%s\n' "$*" >&2
  exit 1
}

# extract a field from a string and echo to stdout
# $1 - source string
# $2 - before target field
# $3 - prefix of target field
# $4 - after target field
extract() {
  # Prints the extracted field on stdout and returns 0. Returns 1 and prints
  # nothing when the source string is empty or does not match *$2$3*$4*.
  #
  # $2, $3 and $4 are used as shell patterns (left unquoted on purpose), so
  # ? * [x-y] in them act as wildcards. The field begins at the last match of
  # $2$3 in the source and is cut just before the first match of $4 in it.
  local text=$1 before=$2 prefix=$3 after=$4
  local field

  [[ -n "$text" ]] || return 1
  [[ -z "${text##*$before$prefix*$after*}" ]] || return 1

  # Drop everything up to and including "$before$prefix", then put the
  # prefix back since it is part of the wanted field.
  field="$prefix${text##*$before$prefix}"
  field="${field%%$after*}"

  # printf rather than echo so fields such as "-n" are printed verbatim.
  printf '%s\n' "$field"
}

# verify the presence of a required executable
verify() {
  # $1 - name of a command the script needs
  # Returns 0 when the shell can resolve the command; otherwise exits through
  # die with an error message naming it.
  #
  # "command -v" returns non-zero for an unknown command. Testing the output
  # of "which" with [ -x ] does not: for a missing command the output is empty
  # and the test degenerates to [ -x ], which is always true.
  command -v "$1" >/dev/null 2>&1 || die "Error: required executable $1 not found"
}

#main()

# Abort early if any external tool used further down is unavailable.
for tool in lynx tiff2ps ps2pdf head mv rm; do
  verify "$tool"
done

# Accept numbers typed with thousands separators (e.g. 6,000,000) by
# removing every comma. Parameter expansion avoids word-splitting or
# globbing of the raw argument and needs no external process.
PATNUM=${1//,/}

[[ -n "$PATNUM" ]] || die "usage: pat2pdf <patent number>"

# Same leading shape as before (two alphanumerics, the first not 0, followed
# by two digits), but the remainder must now be alphanumeric as well: PATNUM
# is pasted into file names and into the query URL, so characters such as
# "/" or "&" must not get through.
[[ "$PATNUM" == [a-zA-Z1-9][a-zA-Z0-9][0-9][0-9]* && "$PATNUM" != *[!a-zA-Z0-9]* ]] \
  || die "Use a 7 digit patent number."

# Progress messages go to the already-open stderr (>&2); re-opening
# /dev/stderr by name can truncate a log file stderr was redirected to.

# --- Step 1: search results page ---------------------------------------
echo "...fetching search results page for patent $PATNUM" >&2
RESULTPAGE=$( url2stdout "${SITENAME}/netacgi/nph-Parser?TERM1=${PATNUM}&Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2Fsrchnum.htm&r=0&f=S&l=50" ) || die "Error fetching search results web page."

if contains "$RESULTPAGE" "No patents have matched"; then
  die "No patents have matched your query."
fi

contains "$RESULTPAGE" "PAT. NO." || die "Search results page in unexpected format - please notify author."

# The title is only informational, so failing to isolate it is not fatal.
TITLE=$( extract "$RESULTPAGE" "RS=PN/???????>" "" "</A>" ) || TITLE="##Error isolating patent title - continuing anyway##"
echo "U.S. Patent $PATNUM: $TITLE"

PATENTURL=$( extract "$RESULTPAGE" "HREF=" "/netacgi/nph-Parse" ">" ) || die "Error isolating URL from results page."
PATENTURL="${SITENAME}${PATENTURL}"

# --- Step 2: patent page ------------------------------------------------
echo "...fetching patent page" >&2
# Fetch first and trim afterwards: piping straight into head would report
# head's exit status and hide a failed download.
PATENTPAGE=$( url2stdout "$PATENTURL" ) || die "Error fetching patent web page."
# Only the first 50 lines are searched for the image link.
PATENTPAGE=$( head -n 50 <<< "$PATENTPAGE" )

IMAGEURL=$( extract "$PATENTPAGE" "a href=" "http://patimg" ">" ) || die "Error isolating image page URL from patent page."

IMAGESERVER=$( extract "$IMAGEURL" "" "http://patimg" "/.piw" ) || die "Error isolating image server name."

# --- Step 3: images page ------------------------------------------------
echo "...fetching images page" >&2
IMAGEPAGE=$( url2stdout "$IMAGEURL" ) || die "Error fetching images page."

NUMPAGES=$( extract "$IMAGEPAGE" "-- NumPages=" "" " --" ) || die "Error getting number of pages."

TIFFURL=$( extract "$IMAGEPAGE" "embed src=?" "/.DImg" "? width=" ) || die "Error getting TIFF file URL."
TIFFURL="$IMAGESERVER$TIFFURL"

# Split the URL around "PageNum=1" so any page number can be spliced in
# between TIFFURL1 and TIFFURL2.
contains "$TIFFURL" "PageNum=1" || die "Error processing TIFF file URL"
TIFFURL1="${TIFFURL%%PageNum=1*}PageNum="
TIFFURL2="${TIFFURL##*PageNum=1}"

# Download each page image, convert it to PostScript and stream all pages
# into a single ps2pdf run.
TIFFTMP="pattmp${PATNUM}.tiff"
PDFTMP="tmppat${PATNUM}.pdf"
PDFOUT="pat${PATNUM}.pdf"

# Remove the scratch files on every exit path, including the failures below.
trap 'rm -f -- "$TIFFTMP" "$PDFTMP"' EXIT

# NUMPAGES was scraped from a web page; insist on a plain decimal number
# before using it as the loop bound.
[[ "$NUMPAGES" =~ ^[0-9]+$ ]] || die "Error getting number of pages."

PAGE=1
{
  while [ "$PAGE" -le "$NUMPAGES" ]; do
    echo "...fetching page $PAGE of $NUMPAGES" >&2
    url2stdout "${TIFFURL1}${PAGE}${TIFFURL2}" > "$TIFFTMP" || die "Error retrieving TIFF page."
    tiff2ps "$TIFFTMP" 2>/dev/null || die "tiff2ps error"
    PAGE=$(( PAGE + 1 ))
  done
} | ps2pdf - "$PDFTMP"
# Must be captured immediately: the next command overwrites PIPESTATUS.
STAGES=( "${PIPESTATUS[@]}" )

# Both sides of the pipeline run in subshells, so a die in the loop above
# ends only that subshell. Check each stage here so a failed download or
# conversion stops the script instead of leaving a partial PDF behind.
[ "${STAGES[1]}" -eq 0 ] || die "GhostScript error."
# A failing producer has already printed its own message through die.
[ "${STAGES[0]}" -eq 0 ] || exit 1

mv -f "$PDFTMP" "$PDFOUT" || die "Error renaming pdf file."

echo "Done." >&2
echo "$PDFOUT" >&2

