rfcstrip

#!/bin/sh
#
# rfcstrip --
#
#       Extract code from RFCs and I-Ds, in text or XML format.
#
# This program is an enhanced version of smistrip.  In addition to
# MIB modules, it recognizes YANG modules and the markers:
#   <CODE BEGINS> file "name-of-file"
#   <CODE ENDS>
#
# OR:
#   <CODE BEGINS> file
#     "very-long-name-of-file"
#   <CODE ENDS>
#
# Also, it handles the xml2rfc v3 XML format (RFC 7991).  In an XML
# file, it recognizes the "sourcecode" element, and extracts it into the
# filename given in the "name" attribute.  It also recognizes the
# "artwork" element (option -a), and extracts it into the filename
# given in the "name" attribute.
#
# Handles line unfolding according to RFC 8792.
#
# Modified by Martin Bjorklund.
#
# smistrip:
# Copyright (c) 1999 Frank Strauss, Technical University of Braunschweig.
#
# See the file "COPYING" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# NOTE, that this script relies on awk (tested with GNU awk), xsltproc
# and getopts (shell builtin like in bash or standalone).
#
# History:
#   1.3 - 2021-12-13
#       updated unfold detection pattern to the RFC 8792 pattern
#   1.2.1 - 2020-08-24
#       added option to extract only files marked with <CODE BEGINS> from
#       text files
#   1.2 - 2020-03-31
#       added option to extract and unfold artwork from xml files
#   1.1 - 2019-11-08
#       extract from xml (xml2rfc v3) files
#   1.0.1 - 2019-11-08
#       removed debug msg, fixed version printout
#   1.0 - 2019-11-07
#       fix regexp for filename on next line
#   0.6 - 2019-02-22
#       handle -f option properly for YANG files without <CODE BEGINS>
#       handle <CODE BEGINS> file with the filename on next line;
#         needed for long file names
#   0.5 - 2018-10-08
#       fixed bug when a line with a single '}' is not properly extracted
#   0.4 - 2018-08-13
#       fixes #1 - liberal parsing of <CODE BEGINS>
#   0.3 - 2018-05-17
#       handle -f option

# Version reported by -V.  Kept in step with the newest entry of the
# History list in the header (1.3: RFC 8792 unfold detection pattern).
VERSION=1.3

# External tools, named through variables so that each can be replaced
# in one place.
AWK=awk
GETOPTS=getopts
GREP=grep
XSLTPROC=xsltproc

# do_version
#
# Write the program name followed by $VERSION to stdout.
do_version () {
    printf 'rfcstrip %s\n' "$VERSION"
}

# do_usage
#
# Write the command-line synopsis and one line of help per option to
# stdout.
do_usage () {
    printf '%s\n' \
        "Usage: rfcstrip [-Vhnac] [-i dir] [-d dir] [-f file] file1 [file2 [...]]" \
        "-V         show version" \
        "-h         show usage information" \
        "-n         do not write files" \
        "-a         extract artwork from XML files" \
        "-c         only extract files marked CODE BEGINS from txt files" \
        "           leading and trailing blank lines are removed" \
        "-i dir     try to read files from directory dir" \
        "-d dir     write file to directory dir" \
        "-f file    strip only the specified file" \
        "file1 ...  input files to parse (RFCs, I-Ds, ...)"
}

# do_strip FILE
#
# Process one input document.
#
# Arguments:
#   $1 - input file name; looked up below $indir when that is set.
# Globals read:    indir, artwork, GREP
# Globals written: FILE (resolved path), CMD (cat or zcat)
#
# If FILE does not exist but FILE.gz does, the compressed file is used.
# Names ending in .gz are read with zcat, everything else with cat.
# The first line of the document decides whether it is handed to
# do_strip_xml or do_strip_txt.  With -a (artwork=1) a non-XML input is
# an error and terminates the script with status 1.
do_strip () {
    if [ "$indir" ] ; then
        FILE="$indir/$1"
    else
        FILE="$1"
    fi

    # fall back to a gzip-compressed copy when only that one exists
    if [ ! -f "$FILE" ] && [ -f "$FILE.gz" ] ; then
        FILE="$FILE.gz"
    fi

    case "$FILE" in
        *.gz)   CMD=zcat ;;
        *)      CMD=cat ;;
    esac

    # NOTE(review): in GNU grep basic regular expressions '\?' makes the
    # preceding '<' optional, so this also accepts a first line that
    # merely contains "xml".  The pattern is kept unchanged so that
    # detection results stay the same; confirm whether a literal
    # '<?xml' was intended.
    if $CMD "$FILE" | head -n 1 | $GREP -q '<\?xml' ; then
        # quoted so that paths containing white space stay one argument
        do_strip_xml "$CMD" "$FILE"
    else
        if [ "$artwork" = "1" ]; then
            echo "-a requires an XML file" >&2
            exit 1
        fi
        do_strip_txt "$CMD" "$FILE"
    fi
}

# do_strip_txt CMD FILE
#
# Extract code from a plain-text document.
#
# Arguments:
#   $1 - command that writes the document to stdout (do_strip passes
#        cat or zcat)
#   $2 - path of the document
# Globals read:    AWK, test, codemarker, dir, single
# Globals written: CMD, FILE
#
# The document is piped through tr, which deletes carriage returns
# (octal 015), and then through one awk program.  That program is a state
# machine on the variable "type":
#   0 - outside of any module
#   1 - inside <CODE BEGINS> file "name" ... <CODE ENDS>; the file name
#       may also stand on the line after the start marker
#   2 - inside an SMI module (from a "... DEFINITIONS ..." line to an END
#       line that is not part of a MACRO definition)
#   3 - inside a YANG module (from "module NAME {" / "submodule NAME {"
#       to a line starting with "}" at the same indentation); the output
#       name is NAME.yang
# States 2 and 3 are only entered when codemarker is not 1 (option -c).
#
# While inside a module every line is collected in the array "line".  A
# line containing a "[Page N]" footer starts page-break handling: up to
# "preskip" blank lines collected just before it are dropped again (a
# non-blank one is kept and reported with a WARNING), and the following
# lines are skipped until more than "skip" lines have been skipped or,
# after more than three skipped lines, a non-blank line appears.
#
# At the end of a module the collected lines are written to dir/NAME (or
# NAME when dir is empty) with the common leading indentation removed,
# unless test is "1" (option -n).  For type 1 the end marker line and
# leading and trailing blank lines are left out.  If single is set
# (option -f), only the module with that name is written.  A summary
# line "NAME: N lines." goes to stdout for each module.
#
# NOTE(review): the SMI branch prints the bare name in its summary line
# while the other two branches print the output path, and the SMI/YANG
# branches compute the indentation from spaces only while the <CODE
# BEGINS> branch also counts tabs - confirm whether that is intended.
# NOTE(review): the output name is taken from the document as is;
# presumably inputs are trusted - verify before running on untrusted
# documents.
do_strip_txt() {
    CMD="$1"
    FILE="$2"

    # The awk program below is a single-quoted shell string, so it must
    # not contain an apostrophe, not even inside an awk comment.
    $CMD "$FILE" | \
    tr -d '\015' | \
    $AWK -vtest="$test" -vcodemarker="$codemarker" \
         -vdir="$dir" -vsingle="$single" '

    BEGIN {
        type = 0
        gen_marker_header = 0
    }

    # generic start marker - we are a bit liberal and accept zero or more
    # spaces between the mandatory tokens
    type == 0 && /^[ \t]*<CODE BEGINS>[ \t]*file[ \t]*"(.*)"/ {
        # extract the filename
        match($0, "\"(.*)\"")
        file = substr($0, RSTART+1, RLENGTH-2)
        preskip = 3
        skip = 5
        skipped = -1
        n = 0
        type = 1
        delete line

        next
    }

    # generic start marker, with (hopefully) filename on next line
    type == 0 && /^[ \t]*<CODE BEGINS>[ \t]*file[ \t]*$/ {
        gen_marker_header = 1
        delete line

        next
    }

    type == 0 && gen_marker_header == 1 && /^[ \t]*"(.*)"/ {
        # extract the filename
        match($0, "\"(.*)\"")
        file = substr($0, RSTART+1, RLENGTH-2)
        preskip = 3
        skip = 5
        skipped = -1
        n = 0
        type = 1
        delete line

        next
    }

     # start of SMI module
    type == 0 && codemarker != 1 &&
    /^[ \t]*[A-Za-z0-9-]* *(PIB-)?DEFINITIONS *(::=)? *(BEGIN)? *$/ {
        file = $1
        preskip = 3
        skip = 4
        skipped = -1
        macro = 0
        n = 0
        type = 2
        delete line
    }

    # start of YANG module
    type == 0 && codemarker != 1 &&
    /^[ \t]*(sub)?module +([A-Za-z0-9-]*) *{ *$/ {
        module = $2
        file = module".yang"
        modindent = match($0, "[^ \t]")
        preskip = 3
        skip = 4
        skipped = -1
        n = 0
        type = 3
        delete line
    }

    # process each line in the file
    type != 0 {
        # at the end of a page we go back one line (which is expected to
        # be a separator line), and start the counter skipped to skip the
        # next few lines.
        if ($0 ~ /\[[pP]age [iv0-9]*\] */) {
            for (i = 0 ; i < preskip ; i++) {
                n--
                if (!(line[n] ~ /^[ \t]*$/)) {
                    print "WARNING: the line:: "line[n]":: \
                       should be blank before a page break. It was kept. "
                    n++
                    break
                }
            }
            # some drafts do not use that separator line. so keep it if
            # there are non-blank characters.

            skipped = 0
        }

        # if we are skipping...
        if (skipped >= 0) {
            skipped++

            # if we have skipped enough lines to the top of the next page...
            if (skipped > skip) {
                skipped = -1
            } else {
                # finish skipping if we find a non-empty line, but not before
                # we have skipped three lines.
                if ((skipped > 3) && ($0 ~ /[^ \t]/)) {
                    skipped = -1
                }
            }
        }

        # so, if we are not skipping, remember the line.
        if (skipped == -1) {
            line[n++] = $0
        }
    }

    # remember when we enter a macro definition
    type == 2 && /^[ \t]*[A-Za-z0-9-]* *MACRO *::=/ {
        macro = 1
    }

    # generic end marker
    type == 1 && /^[ \t]*<CODE ENDS>.*$/ {
        if ((length(single) == 0) || (single == file)) {
            n--
            strip = 99
            for (i=0 ; i < n ; i++) {
                # find the minimum column that contains non-blank characters
                # in order to cut a blank prefix off. Ignore lines that only
                # contain white spaces.
                if (!(line[i] ~ /^[ \t]*$/)) {
                    p = match(line[i], "[^ \t]")
                    if ((p < strip) && (length(line[i]) >= p)) { strip = p }
                }
            }
            if (dir) {
                f = dir"/"file
            } else {
                f = file
            }
            # skip empty lines in the beginning
            j = 0
            for (i=0 ; i < n ; i++) {
                if (!(line[i] ~ /^[ \t]*$/)) {
                    break
                }
                j++
            }
            # skip empty lines at the end
            m = n-1
            for (i=n-1 ; i >= 0 ; i--) {
                if (!(line[i] ~ /^[ \t]*$/)) {
                    break
                }
                m--
            }

            if (test != "1") {
                for (i = j ; i <= m ; i++) {
                    print substr(line[i], strip) >f
                }
            }

            print f ": " 1+m-j " lines."
        }
        file = ""
        type = 0
        gen_marker_header = 0
    }

    # end of SMI module
    type == 2 && /^[ \t]*END[ \t]*$/ {
        if (macro == 0) {
            if ((length(single) == 0) || (single == file)) {
                strip = 99
                for (i=0 ; i < n ; i++) {
                    # find the minimum column that contains non-blank characters
                    # in order to cut a blank prefix off. Ignore lines that only
                    # contain white spaces.
                    if (!(line[i] ~ /^[ \t]*$/)) {
                        p = match(line[i], "[^ ]")
                        if ((p < strip) && (length(line[i]) >= p)) { strip = p }
                    }
                }

                if (dir) {
                   f = dir"/"file
                } else {
                   f = file
                }
                if (test != "1") {
                    for (i=0 ; i < n ; i++) {
                        print substr(line[i], strip) >f
                    }
                }

                print file ": " n " lines."
            }
            file = ""
            type = 0
        } else {
            macro = 0
        }
    }

    # end of YANG module
    type == 3 && /^[ \t]*}.*$/ {
        indent = match($0, "[^ \t]")
        if (indent == modindent) {
            modindent = -1
            # we assume that a module is ended with a single "}" with the
            # same indentation level as the module statement.
            if ((length(single) == 0) || (single == file)) {
                strip = 99
                for (i=0 ; i < n ; i++) {
                    # find the minimum column that contains non-blank characters
                    # in order to cut a blank prefix off. Ignore lines that only
                    # contain white spaces.
                    if (!(line[i] ~ /^[ \t]*$/)) {
                        p = match(line[i], "[^ ]")
                        if ((p < strip) && (length(line[i]) >= p)) { strip = p }
                    }
                }

                if (dir) {
                   f = dir"/"file
                } else {
                   f = file
                }
                if (test != "1") {
                    for (i=0 ; i < n ; i++) {
                        print substr(line[i], strip) >f
                    }
                }

                print f ": " n " lines."
            }
            module = ""
            file = ""
            type = 0
        }
    }

    '
}

# do_strip_xml CMD FILE
#
# Extract named code from an XML (xml2rfc v3) document.
#
# Arguments:
#   $1 - command that writes the document to stdout (do_strip passes
#        cat or zcat)
#   $2 - path of the document
# Globals read:    AWK, XSLTPROC, artwork, single, dir, test
# Globals written: CMD, FILE, XSLFILE1, XSLFILE2, XMLELEM, FILES, name,
#                  outfile
# Returns: 1 if a temporary file cannot be created.
#
# Elements of type "sourcecode" (or "artwork" when artwork is 1) that
# carry a "name" attribute are extracted.  A first stylesheet lists the
# names (skipped when single is set, in which case only that name is
# extracted); a second one prints the text of the element with a given
# name.  That text is post-processed by awk, which
#   - drops leading and trailing blank lines,
#   - drops <CODE BEGINS> / <CODE ENDS> markers and everything after the
#     end marker,
#   - unfolds lines folded with the RFC 8792 single or double backslash
#     strategy when the corresponding NOTE header is present,
#   - writes the result to dir/NAME (NAME when dir is empty) unless test
#     is 1, and prints "PATH: N lines." to stdout.
#
# Diagnostics of xsltproc are discarded, as before.
do_strip_xml() {
    CMD="$1"
    FILE="$2"

    # xsltproc needs the stylesheet as a file
    XSLFILE1=$(mktemp) || return 1
    XSLFILE2=$(mktemp) || { rm -f "$XSLFILE1"; return 1; }
    # Expanded when the trap fires; the files are also removed at the end
    # of this function so that repeated calls do not leave any behind.
    trap 'rm -f "$XSLFILE1" "$XSLFILE2"' 0 2 3 15

    if [ "$artwork" = "1" ]; then
        XMLELEM=artwork
    else
        XMLELEM=sourcecode
    fi

    # the first stylesheet is used to find all sourcecode names
    cat > "$XSLFILE1" <<EOF
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="text"/>
  <xsl:template match="/">
    <xsl:for-each select="//${XMLELEM}[@name]">
      <xsl:value-of select="@name"/>
      <xsl:text> </xsl:text>
    </xsl:for-each>
  </xsl:template>
</xsl:stylesheet>
EOF

    if [ -n "$single" ] ; then
        FILES="$single"
    else
        FILES=$($CMD "$FILE" | $XSLTPROC "$XSLFILE1" - 2> /dev/null)
    fi

    # the next stylesheet prints the contents of the given sourcecode
    cat > "$XSLFILE2" <<EOF
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="text"/>
  <xsl:param name="FILE"/>
  <xsl:template match="/">
    <xsl:for-each select="//${XMLELEM}[@name = \$FILE]">
      <xsl:value-of select="."/>
    </xsl:for-each>
  </xsl:template>
</xsl:stylesheet>
EOF

    # names are separated by white space in the stylesheet output
    for name in $FILES; do
        if [ -n "$dir" ] ; then
            outfile=$dir/$name
        else
            outfile=$name
        fi
        # The stylesheet compares against the "name" attribute, so it has
        # to be given the bare name, not the output path below $dir.
        $CMD "$FILE" | \
        $XSLTPROC --stringparam FILE "$name" "$XSLFILE2" - 2> /dev/null |
        $AWK -v f="$outfile" -v test="$test" '

        # write out the blank lines held back so far, in input order
        function flush_blank(    i) {
            for (i = 1; i <= lcnt; i++) {
                n++
                if (test != 1) {
                    print lines[i] >f
                }
            }
            lcnt = 0
        }

        BEGIN {
            n = 0
            x = 0
            lcnt = 0
            type = 0
            gen_marker_header = 0
            unfold = 0
            prev_backslash = 0
        }

        # everything after the generic end marker is ignored
        type == 2 {
            next
        }

        # remove blank lines at the top
        x == 0 && /^[ \t]*$/ {
            next
        }

        # remove generic start marker if it is present
        x == 0 && /^[ \t]*<CODE BEGINS>[ \t]*file[ \t]*"(.*)"/ {
            type = 1
            next
        }

        # generic start marker, with (hopefully) filename on next line
        x == 0 && /^[ \t]*<CODE BEGINS>[ \t]*file[ \t]*$/ {
            type = 1
            gen_marker_header = 1
            next
        }
        x == 0 && gen_marker_header == 1 && /^[ \t]*"(.*)"/ {
            next
        }

        # detect single backslash strategy
        x == 0 && /NOTE: '\''\\'\'' line wrapping per RFC 8792/ {
            unfold = 1
            next
        }
        # detect double backslash strategy
        x == 0 && /NOTE: '\''\\\\'\'' line wrapping per RFC 8792/ {
            unfold = 2
            next
        }
        # skip empty line after backslash strategy header
        x == 0 && unfold != 0 && /^$/ {
            x = 1
            next
        }
        x == 0 {
            x = 1
        }
        unfold == 1 && prev_backslash == 1 && /^[ \t]+/ {
            sub("^[ \\t]+", "")
        }
        unfold == 1 && /\\$/ {
            # blank lines seen before this line belong in front of it
            flush_blank()
            if (test != 1) {
                # the text is data, never a format; awk strings start at 1
                printf "%s", substr($0, 1, length($0)-1) >f
            }
            prev_backslash = 1
            next
        }
        unfold == 2 && prev_backslash == 1 && /^[ \t]*\\/ {
            sub("^[ \\t]*\\\\", "")
        }
        unfold == 2 && /\\$/ {
            flush_blank()
            if (test != 1) {
                printf "%s", substr($0, 1, length($0)-1) >f
            }
            prev_backslash = 1
            next
        }
        unfold != 0 {
            prev_backslash = 0
        }
        # generic end marker
        type == 1 && /^[ \t]*<CODE ENDS>.*$/ {
            # skip rest
            type = 2
            next
        }

        # empty line, collect it in order to remove empty lines at the end
        /^[ \t]*$/ {
            lcnt++
            lines[lcnt] = $0
            next
        }
        {
            flush_blank()
            n++
            if (test != 1) {
                print >f
            }
        }
        END {
            print f ": " n " lines."
        }
        '
    done

    rm -f "$XSLFILE1" "$XSLFILE2"
}

# Option state.  Start from empty values so that identically named
# variables inherited from the environment cannot change the behaviour.
artwork=
codemarker=
test=
single=
indir=
dir=

# Parse the command line.  Only the options documented by do_usage are
# accepted; any other option prints the usage text and exits with 1.
while $GETOPTS Vhnaci:d:f: c ; do
    case "$c" in
        a)      artwork=1
                ;;
        c)      codemarker=1
                ;;
        n)      test=1
                ;;
        f)      single=$OPTARG
                ;;
        i)      indir=$OPTARG
                # quoted so that names with white space are tested as one word
                if [ ! -d "$indir" ] && [ ! -h "$indir" ]; then
                    echo "$indir is not a directory" >&2
                    exit 1
                fi
                ;;
        d)      dir=$OPTARG
                if [ ! -d "$dir" ] && [ ! -h "$dir" ]; then
                    echo "$dir is not a directory" >&2
                    exit 1
                fi
                ;;
        h)      do_usage
                exit 0
                ;;
        V)      do_version
                exit 0
                ;;
        *)      do_usage
                exit 1
                ;;
    esac
done

# drop the parsed options; what remains are the input files
shift $((OPTIND - 1))

if [ $# -eq 0 ] ; then
    do_usage
    exit 1
fi

for f in "$@" ; do
    do_strip "$f"
done

exit 0