Source code for aenet.commandline.aenet_sfp

#!/usr/bin/env python3

import os

import pandas as pd

from .tools import AenetToolABC
from ..trainset import TrnSet

__author__ = "The aenet developers"
__email__ = "aenet@atomistic.net"
__date__ = "2020-11-30"



[docs]
class SFP(AenetToolABC):
    """ Compute structure fingerprints """

    # Command-line tool that reads an aenet training set, computes one
    # moment-based fingerprint per structure, and appends the results
    # (fingerprint components plus atom count, energy, and path) to a
    # CSV file, one row per structure.

    def _set_arguments(self):
        """Register the command-line arguments of this tool.

        Adds the positional training-set path and the optional
        ``--moment``, ``--output-file``, and ``--atom-types`` switches
        to ``self.parser``.

        """
        self.parser.add_argument(
            "training_set_file",
            help="aenet training set in ASCII, HDF5, or binary format.")

        self.parser.add_argument(
            "-m", "--moment",
            help="Maximal moment for fingerprint expansion (default: 2).",
            type=int,
            default=2)

        self.parser.add_argument(
            "-o", "--output-file",
            help="Path to the CSV output file (default: structures.csv). "
                 "Note: if file exists, data will be appended.",
            type=str,
            default="structures.csv")

        self.parser.add_argument(
            "-t", "--atom-types",
            help="Selected atom types (default: use all).",
            type=str,
            default=None,
            nargs="+")

    def _man(self):
        """Return the long-form description text of this tool."""
        return """
        Featurize atomic structures by calculating structure
        'fingerprints' using the approach of reference [1].

        [1] H. Guo, Q. Wang, A. Urban, N. Artrith, arXiv:2201.11203, 2022,
            https://arxiv.org/abs/2201.11203

        Parses a training-set file produced by ``generate.x`` and
        converted to ASCII format with ``trnset2ascii.x``.  The atomic
        environment features of a structure are combined by calculating
        moments of their distribution (mean, standard deviation, etc.).
        For further details, see the original publication [1].

        The dimension of the resulting structural fingerprints is

           :math:`D = D_{atom} * N_{moment}`

        where :math:`D_{atom}` is the dimension of the atomic
        environment descriptor, and :math:`N_{moment}` is the maximal
        moment used for the fingerprint expansion.

        Note: Usually, the dimension of the structure fingerprints can
        be significantly reduced using standard dimension reduction
        methods, such as principal component analysis.

        """

    def analyze(self, training_set_file, moment, output_file, atom_types):
        """Compute structure fingerprints and append them to a CSV file.

        Arguments:
          training_set_file (str): Path of the training-set file; it is
            handed to ``TrnSet.from_file``.
          moment (int): Maximal moment, forwarded to
            ``moment_fingerprint``.
          output_file (str): Path of the CSV file.  The file is opened
            in append mode, so existing rows are kept.
          atom_types (list or None): Atom-type selection, forwarded to
            ``moment_fingerprint`` as ``sel_atom_types``.

        Each structure becomes one CSV row holding the fingerprint
        components followed by ``num_atoms``, ``energy``, and ``path``.
        The column header is written only if the output file does not
        exist yet or is empty, so that appending to an existing file
        does not insert a second header row in the middle of the data.

        """
        ts = TrnSet.from_file(training_set_file)
        print(ts)
        print("Maximal moment for structure fingerprints: {}".format(moment))
        print("Writing structure fingerprints to '{}'.".format(output_file))

        # Decide this before opening: opening in append mode creates
        # the file if it is missing.
        write_header = (not os.path.exists(output_file)
                        or os.path.getsize(output_file) == 0)

        columns = None
        # newline='' is what pandas requires for file objects passed to
        # to_csv; without it rows end in '\r\r\n' on Windows.
        with open(output_file, 'a', newline='') as fp:
            for _ in range(ts.num_structures):
                s = ts.read_next_structure()
                # list() guards against an array-like return value, for
                # which '+' would not concatenate.
                sfp = list(s.moment_fingerprint(
                    sel_atom_types=atom_types, moment=moment))
                if columns is None:
                    # The fingerprint length is only known once the
                    # first structure has been processed.
                    columns = (list(range(len(sfp)))
                               + ["num_atoms", "energy", "path"])
                df = pd.DataFrame([sfp + [s.num_atoms, s.energy, s.path]],
                                  columns=columns)
                # NOTE: the DataFrame index is still written, so the
                # first CSV column is 0 in every row (as before).
                df.to_csv(fp, header=write_header)
                write_header = False

    def run(self, args):
        """Run the tool with the parsed command-line arguments `args`."""
        self.analyze(args.training_set_file, args.moment,
                     args.output_file, args.atom_types)




if __name__ == "__main__":
    # Script entry point: build the tool, parse the command line with
    # the tool's own parser, and run it on the parsed arguments.
    sfp_tool = SFP()
    sfp_tool.run(sfp_tool.parser.parse_args())