#!/usr/bin/perl
require 5;
use strict;    # Time-stamp: "2000-11-24 23:36:13 MST"
# no utf;  # Uncomment under Perl 5.6 or later
=head1 NAME
rtf2xml -- dump an RTF document's structure as XML
=head1 SYNOPSIS
  % cat foo.rtf
  {\stylesheet{\fs20 \snext0 Normal;}}
  % rtf2xml < foo.rtf
  <_><_>Normal;
=head1 DESCRIPTION
This program is a filter that takes an RTF stream on STDIN and emits
its document tree structure as XML on STDOUT.  It assumes that the
input is well-formed RTF.
=head1 COPYRIGHT
Copyright (c) 2000 Sean M. Burke.  All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 AUTHOR
Sean M. Burke C
=cut
binmode(STDIN);
die "$0 takes no command-line parameters.  Provide input on STDIN.\n" if @ARGV;
# For a modular approach to RTF parsing, see RTF::Parser in CPAN,
#  http://search.cpan.org/search?dist=RTF-Parser
my $open_count = 0;
while() {
  while( # Iterate over tokens on each line
   m<
    \G(?:
      ([{}])  # \1
      |
      (?: \\
          ([a-z]+)       # \2: keyword
          (-?\d+)?       # \3: number
          \x20?
      )
      |
      (?: \\ 
        (?:  (?: '([0-9a-fA-F]{2})) # \4: hex escape
              | ([-_|~:*\cm\cj{}])) # \5: magic character
        )
      |
      ([\cm\cj]+) # \6
      |
      ([<>&])     # \7
      |
      ([^\\{}\cm\cj]+)   # \8: unescaped character data (not [<>&])
    )
  >sgx) {
    if( defined $1 ) {
      if($1 eq '{') {
	++$open_count;  print '<_>';
      } else {
        exit if --$open_count < 0;   print '';
      }
    } elsif( defined $2 ) {
      #print "\n";
      if(defined($3)) {
        if($2 ne 'bin') {
          print "<$2 _='$3'/>"; # always -+\d+, so needs no escaping
        } else { # special case: the 'bin' word.
          if($3 < 1) { # sanity
            print "";
          } elsif($3 < (length $_ - pos($_))) {  # all here
            my $b = substr($_,pos($_), $3);
            pos($_) += $3; # skip over that stuff;
            $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/''.(ord($1)).';'/eg;
            print "";
          } else {
            my $b = substr($_,pos($_), $3);
            my $l = $3 - length $b;
            $_ = ''; # clear buffer
            $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/''.(ord($1)).';'/eg;
            print "";
          }
        }
      } else {
        print "<$2/>"; # always a-z, so needs no escaping
      }
    } elsif( defined $4 ) {
      print "$4;";
    } elsif( defined $5 ) {
      # ([-_|~:*\cm\cj{}]) )    # \5: magic character of some sort
      if($5 eq '*')      { print '