# striprtf.pl
# desc{  strips RTF codes from an RTF document, not terribly intelligently.  }
#
# Purpose:
#      Use this script to view the unformatted content of an RTF document.
#
# Author:
#      Sean M. Burke, sburke@cpan.org,  http://www.ling.nwu.edu/~sburke/
#
# Usage: 
#      striprtf.pl [inputfile.rtf] [inputfile2.rtf]
# output is on standard output.
#
# Alternate Usage:
#      Feed the RTF in on standard input, as in this UNIX command:
#       cat s*.rtf | striprtf.pl | less

use strict;
use warnings;

# strip_rtf_line($line)
#
# Converts one line of RTF source into plain text and returns the result.
#
#   $line  - a single input line, with or without its line terminator.
#
# The physical line terminator is dropped; the only line breaks in the
# returned string are the ones produced by \par and \row.  This is a
# line-at-a-time heuristic, not an RTF parser: groups are not tracked, so
# the contents of destinations such as the font table still leak through.
# The blank that delimits a control word is left in the output.
sub strip_rtf_line {
    my ($text) = @_;

    # Zero-width "end of control word" test.  Using a lookahead (instead of
    # matching and re-inserting the next character) lets a control word match
    # at the very end of the line and lets adjacent ones (\par\par,
    # \cell\cell) each match, while \pard, \cellx etc. are still left alone.
    my $word_end = qr/(?![a-z])/;

    # Raw CR/LF characters carry no meaning in RTF text.
    $text =~ tr/\r\n//d;

    # Park the escaped characters \\ \{ \} and the \'hh prefix behind a
    # sentinel byte (0x1C, the same value as Perl's default $;) BEFORE any
    # control word is examined, so that literal text such as "C:\\parent"
    # cannot be mistaken for the control word \par.
    $text =~ s/\\([\\{}])/"\x1C" . unpack("H2", $1)/eg;
    $text =~ s/\\'/\x1C/g;

    # Do some extremely basic formatting.
    $text =~ s/\\par$word_end/\n/g;
    $text =~ s/\\row$word_end/\n/g;
    $text =~ s/\\cell$word_end/\t/g;
    $text =~ s/\\tab$word_end/\t/g;

    # Take care of some character codewords.
    $text =~ s/\\[lr]quote$word_end/'/g;
    $text =~ s/\\[lr]dblquote$word_end/"/g;
    $text =~ s/\\~/ /g;          # non-breaking space
    $text =~ s/\\[-*:]//g;       # optional hyphen, \* marker, index subentry
    $text =~ s/\\_/-/g;          # non-breaking hyphen
    $text =~ s/\\e[nm]dash$word_end/-/g;
    $text =~ s/\\e[nm]space$word_end/ /g;
    $text =~ s/\\bullet$word_end/*/g;

    # Drop every remaining control word, then any leftover group braces and
    # stray backslashes.
    $text =~ s/\\[-a-z0-9]+//g;
    $text =~ s/[\\{}]//g;

    # Decode the parked sequences: sentinel + two hex digits becomes the
    # byte with that value (this covers both \'e1 and the hidden \\ \{ \}).
    $text =~ s/\x1C([a-fA-F0-9]{2})/pack("C", hex $1)/eg;

    return $text;
}

# Filter every line from the files named on the command line (or from
# standard input when none are given) to standard output.
while (my $line = <>) {
    print strip_rtf_line($line);
}

