#-------------------------------------------------------------------------------
#  jsondump.pm:
#
#  Module to generate JSON dump
#
#  Transformation options:
#  <afp2web command> -q -c -doc_cold -sp:jsondump.pm -sa:<dump config json> samples\original.pdf
#
#  where the dump config JSON (script argument) must look like the following:
#  {
#      ["page":      "on" || "off",]
#      ["text":      "on" || "off",]
#      ["image":     "on" || "off",]
#      ["line":      "on" || "off",]
#      ["container": "on" || "off",]
#      ["vector":    "on" || "off"]
#  }
#
#  - "page", dumps full page text on "text" attribute of page object
#  - "<object type>", dumps specified type objects on "obj" array attribute of page object
#
#  Examples:
#  From command line :
#  On windows : afp2web.exe -q -c -doc_cold -sp:jsondump.pm -sa:"{\"page\": \"on\"}" samples\original.pdf
#  On unix    : ./afp2web -q -c -doc_cold -sp:jsondump.pm -sa:'{"page": "on"}' samples/original.pdf
#
#  From Windows batch file :
#  SET SCRIPTARGS={\"page\": \"on\"}
#  afp2web.exe -q -c -doc_cold -sp:jsondump.pm -sa:"%SCRIPTARGS%" samples\original.pdf
#
#  From Unix shell script file :
#  SCRIPTARGS='{"page": "on"}'
#  ./afp2web   -q -c -doc_cold -sp:jsondump.pm -sa:"$SCRIPTARGS" samples/original.pdf
#
#  Author    : Fa. Maas (AFP2web Team)
#  Copyright : (C) 2019-2020 by Maas Holding GmbH
#
#  $V102    2020-02-18    AFP-930/OXS-10300: Fixed following error when printing json value to file
#                         ---------------------------------
#                         printf fDumpFile ( $jsonStringTmp );
#                         ---------------------------------
#                         Modification of a read-only value attempted at ./jsondump.pm line NNN
#
#  $V101    2019-04-19    a. AFP-812: Extended with 'page' attribute in Script Argument JSON to dump
#                            or not the full page text
#                         b. AFP-812: Extended to return error when no object is requested to dump
#
#  $V100    2019-02-28    Initial Release
#
#-------------------------------------------------------------------------------

#-----------------------------------------------------------------------
# BEGIN block of module
#
# Extends PERL module search path array (@INC) with new element having
# this script modules path in order to have better module portability
#-----------------------------------------------------------------------
BEGIN {
    #---- Directory part of the script filename ($0). The capture stays
    #---- undefined when $0 does not look like "<dir>/<name>.pm"
    my ( $sModuleDirTmp ) = ( $0 =~ /(.*)\/.*\.pm/ );
    if ( !defined( $sModuleDirTmp ) ){ $sModuleDirTmp = ""; }

    #---- Optional extra entry that has to follow the script directory in @INC
    my @arrParentDirsTmp = ();

    if ( $sModuleDirTmp eq "" ){
        #---- No directory could be extracted, use current directory instead
        $sModuleDirTmp = ".";
    }
    else {
        #---- When the script directory contains "/sfsamples", the path in
        #---- front of it is added to the module search path as well
        my ( $sParentDirTmp ) = ( $sModuleDirTmp =~ /(.*)\/sfsamples/ );
        if ( defined( $sParentDirTmp ) && $sParentDirTmp ne "" ){
            push( @arrParentDirsTmp, $sParentDirTmp );
        }
    }

    #printf STDERR ( "Script filename: " . $0 . " Script filepath: " . $sModuleDirTmp . "\n" );

    #---- Extend module search path. The entries are listed in the order in
    #---- which they end up at the front of @INC
    unshift(
          @INC
        , $sModuleDirTmp . "/../../../perl/site/lib"
        , $sModuleDirTmp . "/../../../perl/lib"
        , $sModuleDirTmp . "/perl/site/lib"
        , $sModuleDirTmp . "/perl/lib"
        , $sModuleDirTmp
        , @arrParentDirsTmp
    );
}

use a2w::Config;
use a2w::Container;
use a2w::Document;
use a2w::Font;
use a2w::Image;
use a2w::Index;
use a2w::Kernel;
use a2w::Line;
use a2w::MediumMap;
use a2w::NOP;
use a2w::Overlay;
use a2w::Page;
use a2w::PSEG;
use a2w::Text;
use a2w::Vector;

use a2w::ConfigConstants;
use a2w::DocumentConstants;
use a2w::PageConstants;
use a2w::FontConstants;

use a2w::core::log::Logger;
use JSON::Tiny;

#-----------------------------------------------------------------------
# Initialize once per process
#-----------------------------------------------------------------------
sub initialize(){

    #---- Purpose   : Stores the framework objects, sets up logging, parses the
    #----             dump config JSON (script argument) and derives the flags
    #----             telling which objects have to be dumped
    #---- Parameter : a2w::Config, a2w::Kernel
    #---- Returns   : 0 on success, else ( <negative code>, <message> )
    #----               -1: log file could not be opened
    #----               -2: no known object type is requested for dump
    #----               -3: script argument is not a valid JSON object

    #---- Get Parameter of initialize( Par: a2w::Config, a2w::Kernel )
    ( $a2wConfigPar, $a2wKernelPar ) = @_;

    #---- Define boolean values
    $TRUE  = 1;    # TRUE  boolean value
    $FALSE = 0;    # FALSE boolean value

    #---- Set/Reset Logging ("sf" in logging level turns script logging on)
    my $sLogLevelTmp = lc( $a2wConfigPar->getAttribute( $a2w::ConfigConstants::LOGGINGLEVEL ) );
    $bLog = ( index( $sLogLevelTmp, "sf" ) >= 0 ) ? $TRUE : $FALSE;

    my $sScriptProcTmp = $a2wConfigPar->getAttribute( $a2w::ConfigConstants::SCRIPTPROCEDURE );
    my $sScriptArgsTmp = $a2wConfigPar->getAttribute( $a2w::ConfigConstants::SCRIPTARGUMENT );
    $sIndexFilePath    = $a2wConfigPar->getAttribute( $a2w::ConfigConstants::INDEXPATH );
    $sOutputFilePath   = $a2wConfigPar->getAttribute( $a2w::ConfigConstants::OUTPUTFILEPATH );
    $sLogPath          = $a2wConfigPar->getAttribute( $a2w::ConfigConstants::LOGPATH );
    $sSpoolFilename    = $a2wKernelPar->getSpoolFilename();

    #---- Instantiate Logger
    $theLogger = undef;
    $bDebugLog = $FALSE;
    if ( $bLog == $TRUE ){
        #---- Set log on log engine
        # NOTE(review): $sStartTime is not assigned anywhere in this file,
        # presumably it is provided from outside - TODO confirm
        $theLogger = a2w::core::log::Logger->getSingleton();
        $theLogger->setStartTime( $sStartTime );

        #---- Register modules that has to be logged
        $theLogger->registerClasses( "a2w::Main" );

        #---- Open log file (named after the spool file, without its path)
        my $sLogFilenameTmp = $sSpoolFilename;
        $sLogFilenameTmp =~ s/^.*[\\\/]//;
        $sLogFilenameTmp .= ".jsondump.log";
        $theLogger->setFilename( $sLogFilenameTmp );
        if ( $theLogger->open( $sLogPath ) == $FALSE ){
            return ( -1, "[ERROR] Unable to open log file (" . $sLogPath . $sLogFilenameTmp . ")" );
        }
        $bLog = $theLogger->isRegistered( "a2w::Main" );

        #---- "sfdbg" in logging level additionally turns debug logging on
        if ( index( $sLogLevelTmp, "sfdbg" ) >= 0 ){
            $theLogger->setLevel( $a2w::core::log::Logger::LEVEL_DEBUG );
            $bDebugLog = $TRUE;
        }

        $theLogger->logFunctionName( "a2w::Main", "initialize()" );
        $theLogger->logMessage( "Running $sScriptProcTmp..." );
        $theLogger->logMessage( "initialize(): Processing " . $sSpoolFilename );
        $theLogger->logMessage( "initialize(): Args: $sScriptArgsTmp, OutputFilePath: $sOutputFilePath" );
    }

    #---- Parse script arguments json
    #---- from_json dies on malformed input, that is turned into an error
    #---- return value here instead of aborting the whole script
    $hrefDumpConfig = undef;
    if ( defined( $sScriptArgsTmp ) && $sScriptArgsTmp ne "" ){
        my $bParsedTmp = eval { $hrefDumpConfig = JSON::Tiny::from_json( $sScriptArgsTmp ); 1; };
        if ( !$bParsedTmp ){
            my $sReasonTmp = ( defined( $@ ) && $@ ne "" ) ? $@ : "unknown error";
            $sReasonTmp =~ s/\s+$//;
            return ( -3, "[ERROR] Invalid script argument JSON (" . $sScriptArgsTmp . "), reason: " . $sReasonTmp );
        }

        #---- Only a JSON object can be used as dump config
        if ( ref( $hrefDumpConfig ) ne 'HASH' ){
            return ( -3, "[ERROR] Script argument must be a JSON object (" . $sScriptArgsTmp . ")" );
        }
    }
    if ( defined( $hrefDumpConfig ) && $bLog == $TRUE ){
        $theLogger->logMessage( "Dump config:" );
        $theLogger->logHashMessage( $hrefDumpConfig );
    }

    #---- Page process flags
    $APPEND = 0;    # append page to Current Document
    $SKIP   = 1;    # skip page
    $NEWDOC = 2;    # new document

    #---- Text separator
    $sTextSeparator = ' ';

    #---- Flags indicating which objects must be dumped in 'obj' attribute of page object
    $hrefDumpObjects = {
          'page'      => $FALSE
        , 'text'      => $FALSE
        , 'image'     => $FALSE
        , 'line'      => $FALSE
        , 'container' => $FALSE
        , 'vector'    => $FALSE
    };

    #---- Override dump object flags based on dump config
    #---- Attribute names are matched case insensitive (like their values);
    #---- unknown attributes are ignored so that they can not satisfy the
    #---- "at least one object requested" check below
    if ( defined( $hrefDumpConfig ) ){
        foreach my $sAttrTmp ( sort( keys( %{ $hrefDumpConfig } ) ) ){
            my $sTypeTmp = lc( $sAttrTmp );
            if ( !exists( $hrefDumpObjects->{ $sTypeTmp } ) ){
                if ( $bLog == $TRUE ){ $theLogger->logMessage( "Ignored unknown dump config attribute: " . $sAttrTmp ); }
                next;
            }

            my $sValueTmp = lc( $hrefDumpConfig->{ $sAttrTmp } );
            if ( $sValueTmp eq "on" ){ $hrefDumpObjects->{ $sTypeTmp } = $TRUE; }
            elsif ( $sValueTmp eq "off" ){ $hrefDumpObjects->{ $sTypeTmp } = $FALSE; }
        }
    }

    #---- Check at least one of the object types is requested to dump
    if ( !( grep { $_ == $TRUE } values( %{ $hrefDumpObjects } ) ) ){
        # None of the object is requested for dump, so return error
        return ( -2, "None of the object is requested for dump" );
    }

    #---- Initialize dump data object
    $hrefJSONDump = {};
    $sDumpFilename = "";

    return 0;
}

#-----------------------------------------------------------------------
# InitializeDoc for each document
#-----------------------------------------------------------------------
sub initializeDoc(){

    #---- Parameter of initializeDoc( Par: a2w::Document ), kept in a global
    #---- so that the later callbacks can access the current document
    ( $a2wDocumentPar ) = @_;

    #---- Nothing more to do when logging is off
    if ( $bLog != $TRUE ){ return 0; }

    #---- Trace the call together with the document name and id
    # NOTE(review): $iDocumentId is not assigned anywhere in this file - confirm
    my $sDocInfoTmp = "Name=" . $a2wDocumentPar->getName() . " Id=" . $iDocumentId;
    $theLogger->logFunctionName( "a2w::Main", "initializeDoc()" );
    $theLogger->logMessage( $sDocInfoTmp );

    return 0;
}

#-----------------------------------------------------------------------
# InitializePage for each page
#-----------------------------------------------------------------------
sub initializePage(){

    #---- Parameter of initializePage( Par: a2w::Page ), kept in a global
    #---- so that afp2web() can process the current page
    ( $a2wPagePar ) = @_;

    #---- Trace the call when logging is on
    $theLogger->logFunctionName( "a2w::Main", "initializePage()" ) if ( $bLog == $TRUE );

    return 0;
}

#-----------------------------------------------------------------------
# Main entry method
# Return values:
#        < 0:    error
#         0:    append page to Current Document
#         1:    skip page
#         2:    first page / new document
#-----------------------------------------------------------------------
sub afp2web(){

    #---- Purpose : Adds the current page (global set by initializePage) to the
    #----           JSON dump. Page attributes are added always, the full page
    #----           text and the page objects depending on the dump object flags
    #---- Returns : $SKIP

    my $iPgIdTmp = $a2wPagePar->getParseId();
    if ( $bLog == $TRUE ){
        $theLogger->logFunctionName( "a2w::Main", "afp2web()" );
        $theLogger->logMessage( "PageId=" . $iPgIdTmp );
    }

    #---- Add page to dump (stored at index "page id - 1")
    my $hrefJSONPageTmp = {
          'id'   => $iPgIdTmp
        , 'res'  => $a2wPagePar->getResolution()
        , 'w'    => $a2wPagePar->getWidth()
        , 'h'    => $a2wPagePar->getHeight()
    };
    $hrefJSONDump->{ 'pages' }[ ( $iPgIdTmp - 1 ) ] = $hrefJSONPageTmp;

    #---- Collect page objects ----#
    my @arrPageObjsTmp = ();

    #---- Collect page texts (needed for the page text as well as for the text objects)
    if ( $hrefDumpObjects->{ 'page' } == $TRUE || $hrefDumpObjects->{ 'text' } == $TRUE ){
        push( @arrPageObjsTmp, _collectPageObjects( $a2wPagePar, 'text', 'getFirstText', 'getNextText' ) );
    }

    #---- Add texts to page object
    if ( $hrefDumpObjects->{ 'page' } == $TRUE ){
        #---- Get page texts as one string
        my $sPageTextContentTmp = _getPageTexts( \@arrPageObjsTmp );

        #---- Add page texts to page object
        if ( $sPageTextContentTmp ne "" ){ $hrefJSONPageTmp->{ 'text' } = $sPageTextContentTmp; }
    }

    #---- Reset page objects list, if text not to be collected
    if ( $hrefDumpObjects->{ 'text' } == $FALSE ){ @arrPageObjsTmp = (); }

    #---- Collect remaining object types: <type>, <first getter>, <next getter>
    my @arrCollectorsTmp = (
          [ 'image',     'getFirstImage',     'getNextImage'     ]
        , [ 'line',      'getFirstLine',      'getNextLine'      ]
        , [ 'container', 'getFirstContainer', 'getNextContainer' ]
        , [ 'vector',    'getFirstVector',    'getNextVector'    ]
    );
    foreach my $arefCollectorTmp ( @arrCollectorsTmp ){
        my ( $sTypeTmp, $sFirstGetterTmp, $sNextGetterTmp ) = @{ $arefCollectorTmp };
        if ( $hrefDumpObjects->{ $sTypeTmp } != $TRUE ){ next; }

        push( @arrPageObjsTmp, _collectPageObjects( $a2wPagePar, $sTypeTmp, $sFirstGetterTmp, $sNextGetterTmp ) );
    }

    #---- Add page objects to dump
    if ( @arrPageObjsTmp > 0 ){
        my $arefObjsTmp = _getPageContent( \@arrPageObjsTmp );
        if ( @{ $arefObjsTmp } > 0 ){ $hrefJSONPageTmp->{ 'obj' } = $arefObjsTmp; }
    }

    #---- Pages are only dumped, never added to an output document
    return $SKIP;
}

#-----------------------------------------------------------------------
# Collect all objects of one type from a page
#
# Iterates with the given first/next getter of the page until the getter
# returns 0 (or undef) and returns a list having one hash per object:
# 'OBJ' (the object), 'TYPE' (given type name), 'XPOS' and 'YPOS'
#-----------------------------------------------------------------------
sub _collectPageObjects{

    #---- Parameter
    #
    # 1. Page object
    # 2. Type name stored on 'TYPE' attribute of each entry
    # 3. Name of page method returning the first object
    # 4. Name of page method returning the next object
    my ( $a2wPageTmp, $sTypeTmp, $sFirstGetterTmp, $sNextGetterTmp ) = @_;

    my @arrObjsTmp = ();
    my $a2wObjectTmp = $a2wPageTmp->$sFirstGetterTmp();
    while ( defined( $a2wObjectTmp ) && $a2wObjectTmp != 0 ){
        push(
              @arrObjsTmp
            , {
                  'OBJ'  => $a2wObjectTmp
                , 'TYPE' => $sTypeTmp
                , 'XPOS' => $a2wObjectTmp->getXPos()
                , 'YPOS' => $a2wObjectTmp->getYPos()
              }
        );
        $a2wObjectTmp = $a2wPageTmp->$sNextGetterTmp();
    }

    return @arrObjsTmp;
}

#-----------------------------------------------------------------------
# FinalizePage for each page
#-----------------------------------------------------------------------
sub finalizePage(){

    #---- Pages need no cleanup, the call is only traced when logging is on
    $theLogger->logFunctionName( "a2w::Main", "finalizePage()" ) if ( $bLog == $TRUE );

    return 0;
}

#-----------------------------------------------------------------------
# FinalizeDoc for each document
#-----------------------------------------------------------------------
sub finalizeDoc(){

    #---- Purpose : Writes the collected dump as JSON to
    #----           <output file path><document output filename without extension>.json
    #---- Returns : 0 on success, else ( -1, <message> ) when the dump file
    #----           could not be opened or written
    #
    # NOTE(review): $hrefJSONDump is not cleared here; if this is called for
    # several documents in one run, each dump would also hold the pages
    # collected before - confirm whether that is intended

    if ( $bLog == $TRUE ){ $theLogger->logFunctionName( "a2w::Main", "finalizeDoc()" ); }

    #---- Get document filename (which is formatted based on FilenamePattern)
    $sDumpFilename = $a2wDocumentPar->getOutputFilename();
    $sDumpFilename =~ s/\.[^.\\\/]+$//; # remove extension of any length (pdf, html, tiff, ps, ...)

    #---- Convert the dump object to JSON
    my $jsonStringTmp = JSON::Tiny::encode_json( $hrefJSONDump );
    if ( $jsonStringTmp eq "" ){ return 0; }

    #---- Open dump file
    #---- Three argument open with lexical handle, so that the filename
    #---- can never be taken as part of the open mode
    my $sDumpFilenameTmp = $sOutputFilePath . $sDumpFilename . ".json";
    my $fDumpFileTmp = undef;
    if ( !open( $fDumpFileTmp, '>', $sDumpFilenameTmp ) ){
        return ( -1, "Unable to open " . $sDumpFilenameTmp . ", reason: " . $! );
    }

    #---- Write dump (V102 Change: used 'print' command instead of 'printf')
    my $bWrittenTmp = print { $fDumpFileTmp } $jsonStringTmp;
    if ( !$bWrittenTmp ){
        my $sReasonTmp = "" . $!;
        close( $fDumpFileTmp );
        return ( -1, "Unable to write " . $sDumpFilenameTmp . ", reason: " . $sReasonTmp );
    }

    #---- Close dump file (errors of buffered writes show up here)
    if ( !close( $fDumpFileTmp ) ){
        return ( -1, "Unable to write " . $sDumpFilenameTmp . ", reason: " . $! );
    }

    return 0;
}

#-----------------------------------------------------------------------
# Finalize once per process
#-----------------------------------------------------------------------
sub finalize(){

    #---- Nothing to clean up at process end, the call is only traced when logging is on
    $theLogger->logFunctionName( "a2w::Main", "finalize()" ) if ( $bLog == $TRUE );

    return 0;
}

#-----------------------------------------------------------------------
# Sorting Algorithm
#
#    first Key is YPOS, second Key is XPOS
#-----------------------------------------------------------------------
sub complex_arrays{
    #---- Sort comparator: works on the sort variables $a and $b, which are
    #---- hashes having 'YPOS' and 'XPOS'. XPOS decides only on equal YPOS
    my $iOrderTmp = ( $a->{ 'YPOS' } <=> $b->{ 'YPOS' } );
    if ( !$iOrderTmp ){ $iOrderTmp = ( $a->{ 'XPOS' } <=> $b->{ 'XPOS' } ); }

    return $iOrderTmp;
}

#-----------------------------------------------------------------------
# Get page texts as one string
#-----------------------------------------------------------------------
sub _getPageTexts{

    #---- Parameter
    #
    # 1. Array of page objects (reference); each entry is a hash having the
    #    text object on 'OBJ' and its position on 'XPOS' and 'YPOS'
    #
    #---- Returns the texts as one string (lines joined by $sTextSeparator),
    #---- undef when there is no object
    my $arefPageObjsTmp = shift;

    #---- Nothing to build without objects
    if ( !defined( $arefPageObjsTmp ) || @{ $arefPageObjsTmp } <= 0 ){ return undef; }

    #---- Sort the texts: first key is ypos, second key is xpos
    my @arrPgObjsTmp = sort complex_arrays @{ $arefPageObjsTmp };

    #---- Build lines based on sorted texts
    #---- A text starts a new line when it is more than the tolerance below
    #---- the first text of the current line, else it is appended (without
    #---- separator) to the current line
    my @arrLinesTmp = ( "" );
    my $iToleranceTmp = 10; # 10 pixels
    my $iCurYPosTmp = $arrPgObjsTmp[ 0 ]->{ 'YPOS' };
    foreach my $hrefTextTmp ( @arrPgObjsTmp ){
        if ( $hrefTextTmp->{ 'YPOS' } > ( $iCurYPosTmp + $iToleranceTmp ) ){
            $iCurYPosTmp = $hrefTextTmp->{ 'YPOS' };

            #---- Start new line
            push( @arrLinesTmp, "" );
        }
        $arrLinesTmp[ -1 ] .= $hrefTextTmp->{ 'OBJ' }->getText();
    }

    return join( $sTextSeparator, @arrLinesTmp );
}

#-----------------------------------------------------------------------
# Get page content as array of objects
#-----------------------------------------------------------------------
sub _getPageContent{

    #---- Parameter
    #
    # 1. Array of page objects (reference); each entry is a hash having the
    #    object on 'OBJ', its type name on 'TYPE' and its position on
    #    'XPOS' and 'YPOS'
    #
    #---- Returns reference to an array having one hash per object, sorted
    #---- by ypos and xpos. Each hash has 'x', 'y', 'type' and the type
    #---- specific attributes added below
    my $arefPageObjsTmp = shift;

    #---- Sort the objects: first key is ypos, second key is xpos
    my @arrPgObjsTmp = sort complex_arrays @{ $arefPageObjsTmp };

    #---- Iterate through objects and make json objects
    my @arrJSONObjectsTmp = ();
    foreach my $obj ( @arrPgObjsTmp ){
        my $a2wObjTmp = $obj->{ 'OBJ' };   # a2w core object
        my $sTypeTmp  = $obj->{ 'TYPE' };

        #---- Add json object
        my $hrefJSONObjTmp = {
              'x'    => $obj->{ 'XPOS' }
            , 'y'    => $obj->{ 'YPOS' }
            , 'type' => $sTypeTmp
        };

        #---- Add json object to list
        push( @arrJSONObjectsTmp, $hrefJSONObjTmp );

        #---- Add object type specific attributes
        if ( $sTypeTmp eq "text" ){
            $hrefJSONObjTmp->{'text'}     = $a2wObjTmp->getText();
            $hrefJSONObjTmp->{'len'}      = $a2wObjTmp->getTextLen();
            $hrefJSONObjTmp->{'angle'}    = $a2wObjTmp->getAngle();
            $hrefJSONObjTmp->{'color'}    = $a2wObjTmp->getColor();
            $hrefJSONObjTmp->{'fontsize'} = $a2wObjTmp->getFontSize() / 10;

            #---- Font is added only when the text has one (neither undef nor 0)
            my $a2wFontTmp = $a2wObjTmp->getFont();   # a2w font object
            if ( defined( $a2wFontTmp ) && $a2wFontTmp != 0 ){
                $hrefJSONObjTmp->{'font'} = {
                    'typeface'  => $a2wFontTmp->getTypefaceName(),
                    #'size'      => $a2wFontTmp->getHeight(), # not used since the value may not be always correct
                    #'width'     => $a2wFontTmp->getWidth(),  # not used since it always returns -1
                    'encoding'  => $a2wFontTmp->getEncoding(),
                    # bold and italic are dumped as the strings "true"/"false"
                    'bold'      => ( $a2wFontTmp->isBold() == $TRUE ) ? "true" : "false",
                    'italic'    => ( $a2wFontTmp->isItalic() == $TRUE ) ? "true" : "false"
                };
            }
        }
        elsif ( $sTypeTmp eq "image" ){
            $hrefJSONObjTmp->{'name'}   = $a2wObjTmp->getName();
            $hrefJSONObjTmp->{'width'}  = $a2wObjTmp->getWidth();
            $hrefJSONObjTmp->{'height'} = $a2wObjTmp->getHeight();
            $hrefJSONObjTmp->{'bpp'}    = $a2wObjTmp->getBitsPerPixel();
        }
        elsif ( $sTypeTmp eq "line" ){
            $hrefJSONObjTmp->{'width'}      = $a2wObjTmp->getWidth();
            $hrefJSONObjTmp->{'len'}        = $a2wObjTmp->getLength();
            $hrefJSONObjTmp->{'color'}      = $a2wObjTmp->getColor();
            $hrefJSONObjTmp->{'horizontal'} = $a2wObjTmp->isHorizontal();
            $hrefJSONObjTmp->{'vertical'}   = $a2wObjTmp->isVertical();
        }
        elsif ( $sTypeTmp eq "container" ){
            $hrefJSONObjTmp->{'name'}   = $a2wObjTmp->getName();
            $hrefJSONObjTmp->{'width'}  = $a2wObjTmp->getWidth();
            $hrefJSONObjTmp->{'height'} = $a2wObjTmp->getHeight();
        }
        elsif ( $sTypeTmp eq "vector" ){
            $hrefJSONObjTmp->{'width'}  = $a2wObjTmp->getWidth();
            $hrefJSONObjTmp->{'height'} = $a2wObjTmp->getHeight();
            $hrefJSONObjTmp->{'color'}  = $a2wObjTmp->getColor();
        }
    }

    return \@arrJSONObjectsTmp;
}

__END__
