#!/usr/bin/perl -w use strict; #use diagnostics; use XML::Filter::BufferText; # to buffer all character events use XML::SAX::Machines qw(:all); # to pipe the 2 SAX handlers use FindBin qw($Bin); use lib $Bin; use wtr2_base; my $DEBUG=0; init_db(); my $CAN_OUTPUT= 0; my @files= @ARGV || (<$dir{invoices}/*.xml>); foreach my $file (@files) { my $handler= wtr2_handler->new(); my $pipeline = Pipeline( XML::Filter::BufferText->new(), $handler, ); my $data = $pipeline->parse_uri ( $file); my $errors= check_invoice( $data); if( !@$errors) { store_invoice( $data); } else { print "ERROR in $file\n ", join( "\n ", @$errors), "\n"; if( $CAN_OUTPUT) { my $rejected_file= rejected( $file); print "adding errors in $rejected_file\n" if( $DEBUG); add_errors( $pipeline, $data, $errors); # the parser is passed so it can be re-used output_doc_to_check( $rejected_file, $data); } }; } sub check_invoice { my( $data)= @_; my $errors=[]; # array ref, holds the error messages check_buyer( $data->{BuyerPartyIdentifier}, $data->{BuyerOrganisationName}, $errors ); check_po( $data->{OrderIdentifier}, $errors); my @rows= @{$data->{InvoiceRow}}; reset_default_row_id(); foreach my $row( @rows) { # this does not cope well with broken row numbers my $row_id= $row->{RowIdentifier} || default_row_id(); print "checking row $row_id\n" if $DEBUG; check_qtty( $row_id, $row->{DeliveredQuantity}, $row->{DeliveredQuantityUC}, $row->{OrderedQuantity}, $row->{OrderedQuantityUC}, $errors ); } return $errors; } sub store_invoice { my( $xml_data)= @_; print "storing invoice $xml_data->{InvoiceNumber}\n"; # build the various data structures my $data; $data->{invoice} = { number => $xml_data->{InvoiceNumber}, date => $xml_data->{InvoiceDate}, po => $xml_data->{OrderIdentifier}, amount_no_tax => $xml_data->{InvoiceTotalVatExcludedAmount}, tax => $xml_data->{InvoiceTotalVatAmount}, amount => $xml_data->{InvoiceTotalVatIncludedAmount}, payment_status => $xml_data->{PaymentStatusCode}, }; $data->{seller} = { identifier => $xml_data->{SellerPartyIdentifier}, name => $xml_data->{SellerOrganisationName}, tax_code => $xml_data->{SellerOrganisationTaxCode}, }; $data->{address} = { street => $xml_data->{SellerStreetName}, town => $xml_data->{SellerTownName}, zip => $xml_data->{SellerPostCodeIdentifier}, country_code => $xml_data->{CountryCode}, po_box => $xml_data->{SellerPostOfficeBoxIdentifier}, }; $data->{contact} = { name => $xml_data->{SellerContactPersonName}, phone => $xml_data->{SellerPhoneNumberIdentifier}, email => $xml_data->{SellerEmailaddressIdentifier}, }; $data->{invoicerow} ||= []; reset_default_row_id(); foreach my $invoicerow (@{$xml_data->{InvoiceRow}}) { push @{$data->{invoicerow}}, { row_id => $invoicerow->{RowIdentifier} || default_row_id(), sku => $invoicerow->{ArticleIdentifier}, name => $invoicerow->{ArticleName}, qty => $invoicerow->{DeliveredQuantity}, qty_unit => $invoicerow->{DeliveredQuantityUC}, unit_price => $invoicerow->{UnitPriceAmount}, amount_no_tax => $invoicerow->{RowVatExcludedAmount}, tax => $invoicerow->{RowVatAmount}, amount => $invoicerow->{RowAmount}, } } store_all( $data); } package wtr2_handler; use base qw(XML::SAX::Base); use Getopt::Long; # to process actions associated with SAX events # all those could be stored as part of the parser object # but IMHO this implies potential name collision my( $content, $start); BEGIN { # declare which element content and attributes we want to store # note that this works fine because elements are not re-used, the element # name always gives enough information to figure out what to do with it # note that the little languages used in the action part depends on the fact # that - is not a valid characters at the start of an XML identifier # actions for start_element handler # # actions format is: # actions : action (';' action)* # only one used here # action : command options+ # command : ('--create' | '--store_att' ) # options : option+ # option : --in # sub_record must be have been defined # --as # name of the field in the hash # # (defaults to the attribute name) # --parent # trigger only when in proper parent # --create only supports the -as option (not used) $start = { # elements that create a new sub_record for repeated content InvoiceRow => '--create', # attributes to be stored # format is #store_att DeliveredQuantity => '--store_att QuantityUnitCode --in InvoiceRow --as DeliveredQuantityUC', OrderedQuantity => '--store_att QuantityUnitCode --in InvoiceRow --as OrderedQuantityUC', }; # actions for characters handler # actions format is: # actions : action (';' action)* # only one used here # action : command options+ # command : ('--store') # options : option+ # option : --in # sub_record must be have been defined # --as # not used here # --parent # trigger only when in proper parent $content= { # content that needs to be stored once per invoice BuyerPartyIdentifier => '--store', BuyerOrganisationName => '--store', InvoiceNumber => '--store', InvoiceDate => '--store', OrderIdentifier => '--store', InvoiceTotalVatExcludedAmount => '--store', InvoiceTotalVatAmount => '--store', InvoiceTotalVatIncludedAmount => '--store', PaymentStatusCode => '--store', SellerPartyIdentifier => '--store', SellerOrganisationName => '--store', SellerOrganisationTaxCode => '--store', SellerStreetName => '--store', SellerTownName => '--store', SellerPostCodeIdentifier => '--store', CountryCode => '--store --parent SellerPostalAddressDetails', SellerPostOfficeBoxIdentifier => '--store', SellerContactPersonName => '--store', SellerPhoneNumberIdentifier => '--store', SellerEmailaddressIdentifier => '--store', # repeated content RowIdentifier => '--store --in InvoiceRow', ArticleIdentifier => '--store --in InvoiceRow', ArticleName => '--store --in InvoiceRow', DeliveredQuantity => '--store --in InvoiceRow', OrderedQuantity => '--store --in InvoiceRow', UnitPriceAmount => '--store --in InvoiceRow', RowVatExcludedAmount => '--store --in InvoiceRow', RowVatAmount => '--store --in InvoiceRow', RowAmount => '--store --in InvoiceRow', }; } # very complex new! it's a hash so we can add the state data needed sub new { my $class = shift; my $self= bless {}, $class; } # reset data for each invoice sub start_document { $_[0]->{data} = {}; # stored data $_[0]->{context} = []; # element name stack } # sub start_element { my( $p, $elt)= @_; my $name= $elt->{Name}; # store the context push @{$p->{context}}, $name; # process actions for the relevant elements if( my $actions= $start->{$name}) { my @actions= split /;/, $actions; # actions are ; separated (not used here) foreach my $action (@actions) { # use Getopt::Long to parse the action local @ARGV= split /\s+/, $action; my %options; GetOptions( \%options, "create", "store_att=s", "in=s", "as=s", "parent=s"); if( $options{parent} and ($p->{context}->[-2] ne $options{parent})) { next; } # process each type of action if( $options{create}) { # create a new sub_record if( $options{store_att}) { die "can't use --store_att and --create in $name => '$action'\n"; } if( $options{in}) { die "can't use --in with --create in $name => '$action'\n"; } $options{as} ||= $name; # the sub_record name defaults to the element name $p->{data}->{$name} ||= []; push @{$p->{data}->{$name}}, {}; } elsif( my $att= $options{store_att}) { # store an attribute my $att_clarkian= "{}".$att; # attributes are indexed using the clarkian notation my $value= $elt->{Attributes}->{$att_clarkian}->{Value}; store( $p->{data}, $options{in}, $options{as} || $name, $value); } else { die "no valid start action found in $name => '$action'\n"; } } } } sub characters { my( $p, $characters)= @_; my $name= $p->{context}->[-1]; if( my $actions= $content->{$name}) { my @actions= split /;/, $actions; foreach my $action (@actions) { local @ARGV= split /\s+/, $action; my %options; GetOptions( \%options, "store", "in=s", "as=s", "parent=s"); if( $options{parent} and ($p->{context}->[-2] ne $options{parent})) { next; } if( $options{store}) { store( $p->{data}, $options{in}, $options{as} || $name, $characters->{Data}); } else { die "no valid contentt action found in $name => '$action'\n"; } } } } sub end_element { pop @{$_[0]->{context}}; } sub end_document { return $_[0]->{data}; } sub store { my( $data, $in, $as, $value)= @_; if( my $sub_record= $in) { # create it in a sub_record $data->{$sub_record}->[-1]->{$as}= $value; } else { # create at top-level $data->{$as}= $value; } } 1; __END__ =head1 NAME wtr2_sax_base =head1 SYNOPSYS perl wtr2_sax_base =head1 DESCRIPTION This code uses SAX to extract the data from the invoices. It parses the invoice and extract the relevant data into a Perl data structure that is then used to check the invoice and update the data base. The first problem to solve when using SAX is that the content of elements can be broken in different calls to the C handler. So I needed to buffer the content. Luckily enough, Robin Berjon's C does just that! So I used a SAX machine (using L) to pipe the 2 handlers, first L, then my own handler: L. Note that C takes care of C extracts all the information needed to check the invoice, then store it in the data base. The resulting data (returned by the L handler) is then used by L and L. As this is something that is likely to be quite common and as there are few SAX modules that do this, I decided to go generic: I created a small language to describe how to extract the data and store it in my custom data structure. The idea is to give an element name (no namespaces are used in this DTD, so there is no need to get fancy) and associate an action to it. Actions can be associated with the start of an element or with its content. At the start of an element it is possible to store attributes or to create new sub-records, for repeatable data in the document, such as C The content of an element can be stored, either as top-level data, for non-repeatable data, or in a sub-record, for repeatable data. The easiest way I found to parse these actions was to use L Overall this is slightly overkill for this problem, but could be re-used in other cases, so I thought it would be worth it to show it here. In order to know in which element the parser is from the C handler I used a stack of element names: the C handler pushes the current element name on the stack and the C handler pops it. This is the only way to get access to the parent name, needed for the C<--parent> option. Overall the code was quite a pain to write, especially as the default parser, L had a problem during my tests, as once again I had upgraded C but not the Perl module. The hardest part was designing a way to express what I wanted to extract from the XML document and how to store it, without resorting with one of those long lists of Cs that I find make code such a pain to maintain. =head1 AUTHOR Michel Rodriguez =head1 LICENSE This code is Copyright (c) 2003 Michel Rodriguez. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Comments can be sent to mirod@xmltwig.com =head1 SEE ALSO XML::SAX XML::SAX::Machines XML::Filter::BufferText Ways to Rome 2 - Kourallinen Dollareita: http://www.xmltwig.com/article/ways_to_rome_2/