File grotcat.py from the latest check-in


#!/usr/bin/env python3.8
# encoding: utf-8
from __future__ import print_function, division

__doc__="""
Concatenate two or more GROT GPlates rotation files.

"""
__author__ = "Christian Heine; mailto:chhei@paleoearthlabs.org"
__license__ = """
    grotcat.py -- gptools Python scripts for GPlates
    Copyright (C) 2020 Christian Heine, PaleoEarthLabs.org
    
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
__version__="2.0"
__copyright__ = "Copyright (c) Christian Heine, 2020."

from gptools import *
import argparse
import sys
from collections import defaultdict

# =========================================================================== #
# --- Set up command line options
# =========================================================================== #

__usage__ = "usage: %prog [options] inputfile"

# parser.add_argument("-f", "--file", dest="filename",
                  # help="write report to FILE", metavar="FILE")

parser = argparse.ArgumentParser(  )
# parser = argparse.ArgumentParser( usage=__usage__, version=__version__ )

parser.add_argument( "-v", "-V", "--verbose", action="store_true", dest="VERBOSE", default=False,
                    help="make lots of noise [false]" )

parser.add_argument( 'grotFiles', type=argparse.FileType('r'), nargs='+' )
# parser.add_argument( "-f", 'outFile', type=argparse.FileType('w'), )

parser.add_argument( "-of", "--outfile",  # action="store_true",
                    metavar="outFileName", dest="outFileName", default="GROT",
                    help="Provide the neame of the output file (no extension)" )

args  = parser.parse_args()

print(args)

# #--- Output format options
# parser.add_argument( "-o", "--outformat",  # action="store_true",
#                     metavar="outputFormat", dest="outformat", default="GROT",
#                     help="Chose from the following output format options: GROT [default], ROT [PLATES rotationfile format]" )
#
#
# parser.add_argument( "-c", "--concatenate",
#                     metavar="ConCat", dest="concatenateGrot", default="ConcatenatedRotationFiles",
#                     help="""Concatenate mode for GROT files. Requires to provide a list of *.grot files
# which should be combined into a single monolithic *.grot mothership file.
#
# This option requires at least 2 input files.  wildcards such as *.grot or *.rot can be used.
# """ )

        #--- OUTPUT FILE NAMING
        # # TODO write routine for automatic output file name generation in different case
        # out = concatenatedBasename + ".grot"
        # outfile = open(out, 'w')
        # err = concatenatedBasename + "_errors.out"
        # errorfile = open( err, 'w' )
        # plateacros = concatenatedBasename + "_plateacronyms.out"
        # acronymfile = open( plateacros, 'w' )
        # entryID = 0
        #
        # for line in rotLines:
        #     entryID += 1
        #     inputRawRotDict[ entryID ] = readROT( line )

def main():
    """Run script on command line """    
        
    fileCount = 0
    rotData = [] #--- list holding dictionaries of headers of MPRS data of input files
    headerData = [] #--- List holding dictionaries of headers of input files
    refData = [] #--- list holding dictionaries of ref sections of input files
    mPIDlist = [] #--- Global list of all moving plate ids
    majorCollisionList = []
    #--- Loop over all input files
    for grotFile in args.grotFiles:
        
        fileCount += 1
        
        print("------------------------------")
        print("-- Reading file", grotFile)
        
        headerDict, movPlateRotSeqDict, referenceDict = grot_reader( grotFile )
        
        #--- Generate a list of all available MPRS plate ids from all files.
        mPIDlist.append( [ *movPlateRotSeqDict.keys() ] )
        
        rotData.append( movPlateRotSeqDict )
        refData.append( referenceDict )
        headerData.append( headerDict )
        
    #--- Basic stats:
    print( len(headerData) )
    # print( mPIDlist )

    #--- Make sure that the MPID are all unique
    for rotFile in mPIDlist:
        if  len(set( rotFile )) != len(rotFile):
            print("Mismatch between unique MPIDs and MPIDs in rotation file! ")
            print( len(set( rotFile )))
            print( len(rotFile) )
    
    #--- Processing rotation data
    print( "="*80, '\n' )
    print(" Processing rotation data\n")

    # mpidSet = set() #--- a set of unique plate ids
    mergedMPRSDict = {}  #--- Final dictionary of the MPRS merged.    
    # TODO Use logging module for messaging
    
    #--- Generate set of intersecting mPIDs
    intersectionPIDs = set( mPIDlist[0] )
    
    for i in mPIDlist[1:]:
        # print(i)
        intersectionPIDs.intersection_update( i )
        
    print( "Intersecting MPIDs:\n", intersectionPIDs )
    print( len(intersectionPIDs) )
    print("-"*80)
    
    #--- Generate a set of mPIDs which are not in either file.
    nonIntersectingPlateIDs = set( mPIDlist[0] )    
    # print( "Non-intersecting MPIDs:", nonIntersectingPlateIDList)
    for i in mPIDlist[1:]:
        
        nonIntersectingPlateIDs.symmetric_difference_update( i )

    print( "Non-intersecting MPIDs:\n", nonIntersectingPlateIDs )
    print( len(nonIntersectingPlateIDs) )
    print("-"*80)
    
    #--- Assemble global mprs dictionary with plateids which are unique to the files
    #--- This does not require any further selection/testing - for now.    
    for inputMPRSdict in rotData:
        
        for mPID in inputMPRSdict.keys():
            if mPID in nonIntersectingPlateIDs:
                print("-non intersecting PID:", mPID )
                mergedMPRSDict[ mPID ] = inputMPRSdict[ mPID ]

    # print("______"*80)
    # print( mergedMPRSDict)
    # print( len(mergedMPRSDict) )
    print("_"*80)
    print("Looping over intersection plateid list")
    print("_"*80)

    #---  Assemble global mprs dictionary with those plateids which are intersecting
    count = 0
    for mPID in intersectionPIDs:
        print("-(-" * 25)
        print(" ==> MPID:", mPID)
        # if mPID == 101:
        #     print("+++++")
        #     print( inputMPRSdict[ mPID ] )
        #     print("+++++")
        #     print("refRotDict")
        #     print(refRotDict[ mPID ] )
        #     sys.exit(0)
            
        #--- Set up entry in global dictionary to fill subsequent info
        mergedMPRSDict[ mPID ] = {}
        # if count == 10:
        #     break
        # count += 1

        #--- Retrieve header info from first MPRS dict in collection
        # print("  --  header and testing")
        refRotDict = rotData[0]
        stageRotList = [ rotData[0][ mPID ]['StageRots'] ]

        for dictElem in ['MPRS:code','MPRS:name', 'PP', 'C']:

            print( refRotDict[mPID][ dictElem ] )
            mergedMPRSDict[ mPID ][ dictElem ]  = refRotDict[ mPID ][ dictElem ]

        #--- Make sure the stage rotation key exists in the merged dict
        mergedMPRSDict[ mPID ]['StageRots'] = []
        
        #--- Now loop over remaining ones, testing whether the data from header is the same
        #--- as in existing one
        
        # print("testing header data against existing")
        for inputMPRSdict in rotData[1:]:
            for dictElem in ['MPRS:code','MPRS:name', 'PP', 'C']:

                if inputMPRSdict[ mPID ][ dictElem] != mergedMPRSDict[ mPID ][ dictElem ]:
                    print(" --| no match ")
                    print(inputMPRSdict[ mPID ][ dictElem], mergedMPRSDict[ mPID ][ dictElem ])

        #--- For stage rotations, we add all available ones to a nested list for comparison
        # for inputMPRSdict in rotData[1:]:
            
            addStageRotData = inputMPRSdict[ mPID ]['StageRots']
            stageRotList.append( addStageRotData )
        
        # print("StageRot list:")
        # print(stageRotList)
        
        #--- Compare stage rotations
        ages = []
        # ages = [ e[0] for rf in stageRotList for e in rf]
        
        #--- Store the index of the element with it for later quick access
        print(" --|-------------------------------------------")
        print(' ==> Evaluating stage rotation age overlap')
        
        for inputMPRSdict in rotData:
            mprsAges = []
            ageDict = {}
            
            for stageAge in inputMPRSdict[mPID]['StageRots']:
                ageIndex = inputMPRSdict[mPID]['StageRots'].index(stageAge)
                ageDict[ stageAge[0] ] = ageIndex
                # print("This is at position", ageIndex)
                # mprsAges.append( stageAge[0] )
                
            # ages.append( mprsAges )
            ages.append( ageDict )
            # print('mprsages\n', mprsAges)
        print(" ==> Agedict:              ", ages)
        
        #--- Generate set of intersection stage rotation ages from dictionaries 
        #--- Difference/non-intersecting stage rotation ages:
        #--- In these cases we need to merge data
        
        intersectingStages = set( ages[0] )
        symDiffStages = set( ages[0] )
        # print(ages)
        for i in ages[1:]:
            # print(i)
            intersectingStages.intersection_update( set(i) )
            symDiffStages.symmetric_difference_update( set(i) )
            
        print(' ==> Intersecting stages:   ', intersectingStages )
        print(' ==> Sym diff stages:       ', symDiffStages )
        print(" --|------------------------------------------")
        
        #--- Process intersecting and non-intersecting stages to assemble merged file
        for stages in stageRotList:
            # print(stages)
            for stage in stages:
                if stage[0] in symDiffStages:
                     mergedMPRSDict[ mPID ]['StageRots'].append( stage )
        
        intersectStageList = []
        for i in intersectingStages:
            
            # print('Merged dict\n', mergedMPRSDict[ mPID ] )
            #--- Test whether the entries are exactly the same
            #--- Loop over the second and larger entries to find the equivalent rotations
            stageRotGroupIndex = 1 #--- temp solution to just provide index to get list
            for stage in stageRotList[1:]:
                accessIndex = stageRotList.index( stage )

                # print("Age", i)
                # print("index of age in agedict", ages[0][ i ]  )
                # print("stage", stage)
                # print("Access index", accessIndex)
                firstStageRotData = stageRotList[ 0 ][ ages[0][i] ]
                stageRotData = stage[ ages[stageRotGroupIndex][ i ] ]
                
                # print("==>")
                # print(firstStageRotData)
                # print(stageRotData)

                stageRotGroupIndex += 1

                #--- TODO assemble list/dictionary with content to chose from
                #--- allow interactive pick or override?
                
                #--- First check for stage rotation parameters to fit
                if firstStageRotData[0:5] == stageRotData[0:5]:
                    # print("match")
                    #--- in this case we simply stick with the first one
                    #-- Check comments
                    #--- Dict comparison
                    if len( firstStageRotData[5] ) + len( stageRotData[5] ) == 0:
                        # print("no stage rot metadata")
                        newStageMetaData = {}
                    else:
                        newStageMetaData = defaultdict(list)
                        
                        #--- TODO - remove doubling up using a set
                        for d in (firstStageRotData[5], stageRotData[5]):
                                for key, value in d.items():
                                    print(key,value)
                                    try:
                                        newStageMetaData[key].append( *value )
                                    except:
                                        newStageMetaData[key] = value 
                                
                        print(" ==> NEW STAGE METADATA:", newStageMetaData)
                        # firstDictLen =  len( firstStageRotData[6] )
                        # dictLen = len( stageRotData[6] )
                    
                    mergedStageData = firstStageRotData[0:5]
                    mergedStageData.append( newStageMetaData )
                    
                    # mergedStageData = tmpStageData
                    print(" ==> Merged stage data: ", mergedStageData)
                    # mergedMPRSDict[ mPID ]['StageRots'].append( mergedStageData )
                    intersectStageList.append( mergedStageData )

                else:
                    
                    #--- if the XOVER occurs at split time
                    try:
                        #--- If there is a perfect match between the rotation pole data we just merge
                        if firstStageRotData[5]['XOVER']:
                            intersectStageList.append( firstStageRotData )
                            intersectStageList.append( stageRotData )
                            # print(intersectStageList)
                        #--- If there is differing precision in the XOVER, we chose the one with higher 
                        #--- precision
                        # else:
                    
                    #--- If there is no XOVER and there is just difference in precision we chose the higher pre
                    #--- to proceed
                    except KeyError:
                    # else:
                        print(" ==> OUCH!! ")
                        print(" Your moving plate rotation sequences contain non-matching rotation poles!")
                        print(" You will need to correct this manually before you can merge files.\n Bailing out!\n")
                        print("XOVER? first stg,", firstStageRotData)
                        print("XOVER?,stg rot   ", stageRotData)
                        # errorDesc = ""
                        majorCollisionList.append( [mPID, ages[0], firstStageRotData[0:5], stageRotData[0:5]] )
                        sys.exit(0)

                # print(intersectStageList)
        
        #--- Finally, make sure that we insert the intersecting mprs stages at the right time:
        #--- Generate list of all stage rotation ages in mprs
        
        print(" --|-----------------------------------------")
        print("  ==> Merging intersecting MPRS ")
        
        # for i in intersectingStages:
        #     print(i)
        # for i in symDiffStages:
        #     print(i)
        
        symDiffStageAges = [ i[0] for i in mergedMPRSDict[ mPID ]['StageRots']]
        symDiffStageAgeIdx = [ mergedMPRSDict[ mPID ]['StageRots'].index(i) for i in mergedMPRSDict[ mPID ]['StageRots']]
        # print(symDiffStageAges, symDiffStageAgeIdx)
        symDiffStageAgeDict = dict( zip(symDiffStageAges, symDiffStageAgeIdx) )
        print(symDiffStageAgeDict)
        
        #--- Find index where to insert intersecting stage rotation ages:
        #--- https://stackoverflow.com/questions/40688545/inserting-value-into-a-list-of-int-just-before-the-rightmost-item-greater-than-t
        #--- TODO IMplement as function
        # rightmost = 0
        # print("INSERTING INTERSECTING AGES")
        # print(mergedMPRSDict[ mPID ]['StageRots'])
        finalStageRotList = []
        
        for intSecAge in intersectStageList:
            
            print("Intersecting entry:", intSecAge)
        
            for i, val in enumerate( mergedMPRSDict[ mPID ]['StageRots'] ):
                # print()
                print(' ==> Enumerating', i, val[0])
                
                if val[0] > intSecAge[0]:
                    # print("insert here:", val,i,intSecAge[0])
                    mergedMPRSDict[ mPID ]['StageRots'].insert(i, intSecAge)
                    break
                # rightmost = i

    #======================================================================#
    #================== OUTPUT 
    #======================================================================#
    print("="*80)
    print("== GENERATING OUTPUT ")
    print("="*80)
    
    #--- Generate a new list of PIDs which is sorted by cardinality for the PLateIDs > 100
    #--- because the Hotspots/abs ref should always come first
    # print(mPIDlist)
    # print(len(mPIDlist))
    
    newMovPIDList = [ *mergedMPRSDict  ]
    
    print(newMovPIDList)
    print(len(newMovPIDList))
    
    absRefPIDs = sorted( i for i in newMovPIDList if i < 100 )
    cardinalPIDList0 = [ i for i in newMovPIDList if i >= 100 ]
    cardinalPIDList1 = list( map(str, cardinalPIDList0) )

    # if VERBOSE:
    # print(cardinalPIDList1)
    cardinalPIDList1.sort()

    newPIDList = absRefPIDs + list( map(int, cardinalPIDList1) )
    # if VERBOSE:
    print("=" * 80)
    print("= PlateID list in shortlex: ")
    print(newPIDList)
    
    # theOutFile = "MergedFiles.grot"
    outfile = open( args.outFileName + '.grot', 'w')
    
    print( headerData )
    
    for headKey,headValue in headerData[0].items():
        
        print(f'{headKey}"{headValue}"')
        outfile.write( f'{headKey}"{headValue}"\n'  )
    
    #--- Put a divider in to visually separate header from body data 
    outfile.write("#" + "-"*79 + "\n")
    
    # for headerData[1]
    # f'{PlateID1:>8}{Age:11.5f}{Lat:10.5f}{Lon:11.5f}{Angle:11.5f}{PlateID2:>8}'
    
    # outfile.write( headerData[1] )
    OUTFILEFORMAT="GROT"
    
    #--- Write out into new rotation file
    for PlateID1 in newPIDList:
    
        print("========================================================")
        print( "===> PlateID1: ", PlateID1 )
        
        if PlateID1 == 999:
            break
            
        plateIdDict = mergedMPRSDict
        stageList = plateIdDict[ PlateID1 ]
        print(plateIdDict[ PlateID1 ])
        
        try:
            # print("try- mplatecode", mprsDict[ PlateID1 ])
            mPlateCode = mergedMPRSDict[ PlateID1 ][ "MPRS:code" ]
        except KeyError:
            # print("except- mplatecode", plateIdDict[ PlateID1 ][ 0 ][ 5 ])
            mPlateCode = mergedMPRSDict[ PlateID1 ][ 0 ][ 5 ][ 2 ]
    
        try:
            mPlateName = mergedMPRSDict[ PlateID1 ][ "MPRS:name" ]
        except KeyError:
            mPlateName = 'FIXME'
        
        #-- FIXME: we're lacking a FPID in here.
        try:
            fPlateCode = mergedMPRSDict[ PlateID1 ][ "FPID:code" ]
        except KeyError:
            fPlateCode = mergedMPRSDict[ PlateID1 ]['StageRots'][ 0 ][ 4 ]
    
        movPlateRotSequence =  '> @MPRS:pid"{0}" @MPRS:code"{1}" @MPRS:name"{2}"\n> @PP"{3}" @C"Sequence generated by grotcat.py"\n'.format(
            PlateID1, mPlateCode, mPlateName, mergedMPRSDict[ PlateID1 ][ "PP" ]) #, datetime.now().isoformat() 
        
        outfile.write( movPlateRotSequence )
    
        # print( movPlateRotSequence )
    
        #--- Loop over individual, sorted stage rotations for a single plateID
        print("  Loop over individual sorted stage rots ")
        for stage in mergedMPRSDict[ PlateID1 ]['StageRots'] : #

            # if VERBOSE:
            # print( "    Stage rotation age:", stage[0] )
            # print( "    Stage:", stage)
            # print( "    ", "-"*60)
        
            #--- Stage rotation attributes including comment dictionary
            # FIXME: This all needs to go back into a class  - example: http://zetcode.com/python/fstring/
            Age = stage[ 0 ]
            Lat = stage[ 1 ]
            Lon = stage[ 2 ]
            Angle = stage[ 3 ]
            PlateID2 = stage[ 4 ]

            stageRotationCommentDict = stage[ 5 ] # dictionary of comment metadata
            # rotationSource = stage[ 6 ]
            # rotationInactive = stage[ 7 ]
        
            #--- Formatting
        
            # pid1out  = str( PlateID1 ).ljust( 8 )     # TODO: adjust for max permissible plateID Length in GPlates
            # ageout   = str( "%.5f" % Age   ).rjust( 10 )
            # latout   = str( "%.5f" % Lat   ).rjust( 9 )
            # lonout   = str( "%.5f" % Lon   ).rjust( 10 )
            # angleout = str( "%.5f" % Angle ).rjust( 10 )
            # pid2out  = str( PlateID2 ).rjust( 8 )
         
            formattedCommentData = format_grot_mprs_comment( stageRotationCommentDict )
            formattedStageRotationData = f'{PlateID1:>8}{Age:11.5f}{Lat:10.5f}{Lon:11.5f}{Angle:11.5f}{PlateID2:>8}'
            
            # print( formattedCommentData )
            # print( formattedStageRotationData )

            #--- Formatting the output
            # if rotationInactive: # if true do not write line out - in PaleoGIS they use double negation: Inactive:true
            #
            #     if OUTFILEFORMAT == "GROT":
            #
            #         formattedStageRotation =  "#" + " ".join( [formattedStageRotationData, formattedCommentData] ) + "\n"
            #
            #     else: # if false write commented line
            #         print( " Skipping line" )
            #         formattedStageRotation =  "999 0.0 0.0 0.0 0.0 999 # " + " ".join( [formattedStageRotationData, formattedCommentData] ) + "\n"
            #
            # else:
            #     if OUTFILEFORMAT == "GROT":
                    
            formattedStageRotation =  " ".join( [formattedStageRotationData, formattedCommentData] ) + "\n"
                #
                # else:
                #     print("  Using PLATES4 syntax")
                #     formattedStageRotation =  " ! ".join( [formattedStageRotationData, formattedCommentData] ) + "\n"
                
            # if VERBOSE:
            #     print( formattedStageRotation )
            
            #---
            #--- Testing for zero-rotations at time > 0 Ma

            if float( Age ) > 0.0 and Lat == 0.0 and Lat == 0.0 and Angle == 0.0:
                if VERBOSE:
                    print("Stage sequence %s | Zero rotation at %s Ma" % (stage, Age) )
            
                zeroRotations.append( "Zero rotation at %.2f Ma | " % float(Age) + formattedStageRotation )
        
                # if VERBOSE:
                #     print( formattedStageRotation )
        
            #--- Skip those future prediction rotation poles and finally write full 
            #--- stage pole + comments to file
            if not float( Age ) < 0.0 :            
                outfile.write( formattedStageRotation )
        
            else:        
                futureRotations.append( "Future rotation skipped | " + formattedStageRotation )
    
    #--- ADD BIBLIOGRAPHY SECTION
    for biblioDict in refData:
        
        outfile.write(  create_grot_bibinfo_section( biblioDict ) )

    outfile.close()
    
    #--- PLATAE ID AND STAGE OVERLAP LOG FILE
    #--- this file collects info to debug the merging.
    
    if len(majorCollisionList) > 0 :
        print("="*80)
        print("MAJOR ISSUES")
        
        with open( args.outFileName + '_issues.txt', 'w') as f:
            f.write("# These PIDS encountered collisions during merge in their stage rotation part\n")    
            f.write("# mPID, \n")    
            for i in majorCollisionList:
                # [mPID, ages[0], firstStageRotData[0:5], stageRotData[0:5]]
                f.write( f"{i}\n" )
                # print()
        # issuefile.close()
    
    print("DONE!")
    pass

if __name__ == "__main__":
    main()

# EOF