#!/bin/sh #************************************************************************** #* * #* OCaml * #* * #* Damien Doligez, projet Gallium, INRIA Rocquencourt * #* * #* Copyright 2012 Institut National de Recherche en Informatique et * #* en Automatique. * #* * #* All rights reserved. This file is distributed under the terms of * #* the GNU Lesser General Public License version 2.1, with the * #* special exception on linking described in the file LICENSE. * #* * #************************************************************************** # check-typo - Check typographic conventions on OCaml sources. # This program will check files for the following rules: # - absence of TAB characters (tab) # - absence of non-ASCII characters (non-ascii) # - absence of non-printing ASCII characters (non-printing) # - absence of white space at end of line (white-at-eol) # - absence of empty lines at end of file (white-at-eof) # - presence of a LF character at the end of the file (missing-lf) # - maximum line length of 80 characters (long-line) # - maximum line length of 132 characters (very-long-line) # - presence of a copyright header (missing-header) # - absence of a leftover "$Id" string (svn-keyword) # - formatting of version numbers in @since, @after and @before (doc-version) # Exceptions are handled with git attributes: "typo.*". # Its value for a given file is a comma-separated list of rule names, # which lists the rules that should be disabled for this file. # The rule names are the ones shown above in parentheses. # Built-in exception: # - Any file git identifies as binary # is automatically exempt from all the rules. # ASCII characters are bytes from 0 to 127. Any other byte is # flagged as a non-ASCII character. # For the purpose of this tool, printing ASCII characters are: # - the non-white printable ASCII characters (33 to 126) # - TAB (09) # - LF (10) # - SPC (32) # Anything else is flagged as a non-printing ASCII character. 
# This program will recursively explore the files and directories given
# on the command line (or by default the current directory), and check
# every file therein for compliance to the rules.

# Directories named .git (and their contents) are always ignored.
# This program ignores any file that is not under git control, unless
# explicitly given on the command line.

# If a directory has the git attribute "typo.prune" then it and its
# contents are ignored.

# You can ignore a rule by giving the option -<rule> on the command
# line (before any file names).

# Files which include the non-ascii rule will be validated using grep and
# line-length computations will take UTF-8 sequences into account. As a
# special case, UTF-8 sequences are always allowed in the copyright
# headers. Files which include the non-ascii rule must still be valid
# UTF-8 (the only alternative is to mark the file as binary).

# First prevent i18n from messing up everything.
export LC_ALL=C

# Tunables, each overridable from the environment:
#   OCAML_CT_CAT       command used to print the content of a file
#   OCAML_CT_LS_FILES  command used to test whether a path is in the index
#   OCAML_CT_HEAD      tree-ish handed to git diff-tree for binary detection
#   OCAML_CT_AWK       awk implementation (needs {m} interval support)
OCAML_CT_CAT=${OCAML_CT_CAT:-cat}
OCAML_CT_LS_FILES=${OCAML_CT_LS_FILES:-git ls-files}
OCAML_CT_HEAD=${OCAML_CT_HEAD:-HEAD}
OCAML_CT_AWK=${OCAML_CT_AWK:-gawk}

# OCAML_CT_GIT_INDEX is later expanded unquoted after "env": it must be
# either empty (no extra word) or a complete NAME=value assignment.
# An unset variable yields the empty form; a set one (even if empty) is
# turned into "GIT_INDEX_FILE=<value>".
if [ -z "${OCAML_CT_GIT_INDEX+x}" ] ; then
  OCAML_CT_GIT_INDEX=
else
  OCAML_CT_GIT_INDEX="GIT_INDEX_FILE=$OCAML_CT_GIT_INDEX"
fi

# The output of processing the attributes should be whitespace-separated with
# - the "typo." prefix dropped
# - unset/false keys not present
# - set/true keys present
# - "may" keys present, suffixed by a question mark
#
# for example,
#   typo.long-line: set
#   typo.missing-header: may
#   typo.very-long-line: false
# should result in "long-line missing-header?"
# Print the typo.* git attributes of path $1 as a whitespace-separated
# list: the "typo." prefix is dropped, unset/false attributes are omitted,
# set/true attributes appear bare and "may" attributes get a "?" suffix.
# $OCAML_CT_GIT_INDEX and $OCAML_CT_CA_FLAG are deliberately unquoted so
# that an empty value contributes no word to the command line.
get_attrs() {
  env $OCAML_CT_GIT_INDEX git check-attr --all $OCAML_CT_CA_FLAG "$1" \
    | grep -o " typo\\..*$" | sed "s/ typo\\.//g" \
    | grep -v ": unset" | grep -v ": false" \
    | sed "s/: set//g" | sed "s/: true//g" | sed "s/: may/?/g"
}

# Print the typo.prune attribute line of path $1 when it is neither
# unspecified nor false; the output is empty if the path is *not* pruned.
check_prune() {
  env $OCAML_CT_GIT_INDEX git check-attr typo.prune $OCAML_CT_CA_FLAG "$1" \
    | grep -v ': unspecified$' | grep -v ': false$'
}

# Internal sub-commands.
#   --check-prune PATH  recursive call from the find command (see
#                       IGNORE_DIRS): exit 0 if PATH must be pruned,
#                       3 otherwise.
#   --get-attrs PATH    print the output of get_attrs for PATH.
case "$1" in
  --check-prune)
    case $2 in
      .git|.git/*)
        echo "INFO: pruned path $2 (.git)" >&2
        exit 0;;
    esac
    if git check-ignore -q "$2"; then
      exit 0
    fi
    if test -n "$(check_prune "$2")"; then
      echo "INFO: pruned path $2 (typo.prune)" >&2
      exit 0
    fi
    exit 3;;
  --get-attrs)
    get_attrs "$2"
    exit 0;;
esac

# Print the command-line synopsis on stderr and exit with status 2.
usage () {
  echo "usage: check-typo {-<rule>} [--] {<file or dir>}" >&2
  exit 2
}

# Complain (and set EXIT_CODE=1) when the executable file $1 does not start
# with a #! line, unless it is one of the known exceptions.
check_script () {
  if [ "$($OCAML_CT_CAT "$OCAML_CT_PREFIX$1" \
          | sed -ne '1s/^#!.*/#!/p')" != '#!' ] ; then
    # These files are listed manually, rather than via gitattributes,
    # because the list should never expand, and it should not be trivial to
    # expand (the unix-execvpe test is an ultra-special-case!)
    # A dedicated variable is used here: POSIX sh has no local variables and
    # the caller keeps the path being checked in $f.
    script_path=${1#./}
    if [ "$script_path" != "boot/ocamlc" ] && \
       [ "$script_path" != "boot/ocamllex" ] && \
       [ "$script_path" != \
           "testsuite/tests/lib-unix/unix-execvpe/subdir/script2" ] ; then
      echo "$1 shouldn't be executable; either:"
      echo " - Add a #! line"
      echo " - Run chmod -x $1 (on Unix)"
      echo " - Run git update-index --chmod=-x $1 (on Windows)"
      echo "You may wish to check your core.fileMode setting"
      EXIT_CODE=1
    fi
  fi
}

# Rules disabled on the command line, as a space-separated list.
userrules=''

while : ; do
  case "$1" in
    -help|--help) usage;;
    # "--" must be tested before the generic "-*" arm, otherwise it is
    # swallowed as an (empty-named) rule and never ends option parsing.
    --) shift; break;;
    -*) userrules="${1#-} $userrules"; shift;;
    *) break;;
  esac
done

# Expanded unquoted in the find commands below, so only the words matter.
IGNORE_DIRS="
  -name .git -prune -o
  -type d -exec $0 --check-prune {} ; -prune -o
"
# `-type d`: simple files (not directories) are not pruned during the
# "find" invocation but below (look for "check_prune") for performance
# reasons: most files outside pruned directories are not pruned, so it
# is faster to optimistically run check-typo on them (and maybe get
# out in the middle) than to first check then run.

# Probe for interval expressions: the test program exits 1 when "a{1}" is
# matched literally, i.e. when {m} is not understood as a repetition.
TEST_AWK='BEGIN {if ("a{1}" ~ /a{1}$/) exit 1}'
if ! $OCAML_CT_AWK "$TEST_AWK" ; then
  if $OCAML_CT_AWK --re-interval "$TEST_AWK" 2>/dev/null ; then
    OCAML_CT_AWK="$OCAML_CT_AWK --re-interval"
  else
    echo "This script requires interval support in regexes ({m} notation)">&2
    echo "Please install a version of awk (e.g. gawk) which supports this">&2
    exit 2
  fi
fi

EXIT_CODE=0
(
  # Producer: one candidate path per line.
  case $# in
    0) find . $IGNORE_DIRS -type f -print;;
    *) for i in "$@"; do find "$i" $IGNORE_DIRS -type f -print; done;;
  esac
) | (
  # Hash of the empty tree, computed at most once (see below).
  EMPTY=
  # IFS= and -r keep backslashes and leading/trailing blanks of the path.
  while IFS= read -r f; do
    if test -n "$(check_prune "$f")"; then continue; fi
    if git check-ignore -q "$f"; then continue; fi
    case $($OCAML_CT_LS_FILES "$f" 2>&1) in
      "") path_in_index=false;;
      *) path_in_index=true;;
    esac
    # The path is quoted so that it is compared literally, not as a glob.
    case "$*" in
      *"$f"*) is_cmd_line=true;;
      *) is_cmd_line=false;;
    esac
    if $path_in_index || $is_cmd_line; then :; else continue; fi
    if [ -z "$OCAML_CT_PREFIX" ] ; then
      if [ -x "$f" ] ; then
        check_script "$f"
      fi
    else
      if git ls-files -s "$f" | grep -q "^100755" ; then
        check_script "$f"
      fi
    fi
    attr_rules=''
    if $path_in_index; then
      # Below is a git plumbing command to detect whether git regards a
      # particular file as binary. This takes into account .gitattributes,
      # but also works if the file has been automatically detected as binary
      # by git. EMPTY is the hash of the empty tree (which is specially known
      # to git - it is automatically included in every repository) as a way
      # to get `diff-tree` to print the whole tree state; its `--numstat`
      # output then prints a summary where two dashes in the first two
      # columns indicates a binary file.
      # (See https://git-scm.com/docs/git-diff-tree#_other_diff_formats and
      # the documentation for the --numstat option. Commands designated as
      # "plumbing" commands in git have stable output intended for parsing)
      if [ -z "$EMPTY" ]; then
        EMPTY=$(git hash-object -t tree /dev/null)
      fi
      git diff-tree --numstat $EMPTY $OCAML_CT_HEAD -- "$f" \
        | grep -q "^-[[:blank:]]-" && continue
      attr_rules=$(get_attrs "$f")
    fi
    rules="$userrules"
    # remove newlines, ensure spaces at boundary
    rules=" $(echo $rules) "
    attr_rules=" $(echo $attr_rules) "

    # grep -a is used to force the file to be considered as text and -x
    # requires the entire line to match. This specifically detects the
    # presence of lines containing malformed UTF-8. It may be tested using
    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
    if $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" \
       | LC_ALL=C.UTF-8 grep -qaxv '.*' ; then
      echo "File \"$f\" is not correctly encoded in UTF-8"
      exit 2
    fi
    # The extra "echo" appends a newline, so that a non-empty last record
    # in awk means that the file lacks its final LF. The awk program exits
    # with a non-zero status when it reported at least one error.
    if ! ( $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" | tr -d '\r'; echo ) \
         | $OCAML_CT_AWK -v rules="$rules" -v attr_rules="$attr_rules" \
                         -v file="$f" \
'
# True when rule NAME is enabled, i.e. listed neither on the command line
# (rules) nor in the git attributes (attr_rules).
function is_err(name) {
  return ((rules attr_rules) !~ (" " name "[\\? ]"));
}

# Print one diagnostic, located with NR and RSTART + RLENGTH.
function report_err(name, msg) {
  printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
  printf (" [%s] %s\n", name, msg);
  got_errors = 1;
}

# Count an occurrence of NAME and report it if the rule is enabled;
# at most 10 occurrences per rule are reported.
function err(name, msg) {
  ++ counts[name];
  if (is_err(name) && counts[name] <= 10) {
    report_err(name, msg);
    if (counts[name] == 10){
      printf ("WARNING: too many [%s] in this file.", name);
      printf (" Others will not be reported.\n");
    }
  }
}

# Like err(name, msg), but only when rule GUARD is enabled as well;
# otherwise the occurrence is merely counted.
function err_if(guard, name, msg) {
  if (is_err(guard)) {
    err(name, msg);
  } else {
    ++ counts[name];
  }
}

# True when STR is wider than LIMIT columns, with tab stops every 8 columns.
# c and i are local variables.
function more_columns(str, limit,    c, i){
  c = 0;
  for (i = 1; i <= length(str); i++){
    if (substr(str, i, 1) == "\t"){
      c = int((c + 8) / 8) * 8;
    }else{
      ++ c;
    }
  }
  return c > limit;
}

# When the non-ascii rule is disabled, return STR with every multi-byte
# sequence replaced by a single "?" (and count a non-ascii occurrence);
# otherwise return STR unchanged. decoded is a local variable.
function utf8_decode(str,    decoded) {
  if (is_err("non-ascii")) {
    return str;
  } else {
    # This script assumes that the UTF-8 has been externally validated
    decoded = str;
    gsub(/[\300-\367][\200-\277]+/, "?", decoded);
    if (decoded != str) {
      ++ counts["non-ascii"];
    }
    return decoded;
  }
}

BEGIN {
  state = "(first line)";
  in_recipe = 0;
  in_continuation = 0;
}

# Makefile recipe automaton
# in_continuation == 1 if the line ends with a backslash
# in_recipe is:
#   0 - not in a recipe
#   1 - target line scanned, but not yet seen first recipe line
#   2 - scanning recipe lines

# Non-recipe line
match($0, /^[^\t#] *[^# ]/) {
  if (!in_continuation) {
    if (!match($0, /^(ifn?eq|else|endif)/)) {
      in_recipe = 0;
    }
  }
}

# target: or target:: line
match($0, /^[^#]*[^:#]::?($|[^=])/) {
  if (!in_continuation) {
    in_recipe = 1;
  }
}

match($0, /^\t[^\t]+$/) {
  if (in_recipe == 0 \
      || in_recipe == 1 && in_continuation \
      || is_err("makefile-whitespace")) {
    err("tab", "TAB character(s)");
  } else {
    ++ counts["makefile-whitespace"];
    in_recipe = 2;
  }
}

match($0, /.$/) {
  in_continuation = (substr($0, length($0)) == "\\");
}

match($0, /.\t/) {
  err("tab", "TAB character(s)");
  t = utf8_decode($0);
  if (more_columns(t, 80)){
    RSTART=81;
    RLENGTH = 0;
    err_if("very-long-line", "long-line", "line is over 80 columns");
  }
  if (more_columns(t, 132)){
    RSTART=133;
    RLENGTH = 0;
    err("very-long-line", "line is over 132 columns");
  }
}

match($0, /[\200-\377]/) \
&& state != "authors" && state != "copyright" {
  if (is_err("non-ascii")) {
    err("non-ascii", "non-ASCII character(s)");
  } else {
    ++ counts["non-ascii"];
  }
}

match($0, /@(since|after|before) +[0-9.]+/) {
  if (match($0, / [34]\.[0-9]([^0-9]|$)/)) {
    err("doc-version", "version should be formatted M.mm (e.g. 4.08)");
  } else if (match($0, / [5-9][0-9]*\.0[0-9]/)) {
    err("doc-version", "version should be formatted M.m (e.g. 5.0)");
  } else if (match($0, / [0-9]+\.[0-9]+\.0/)) {
    err("doc-version", "version should not include .0 revision");
  }
}

match($0, /[^\t\200-\377 -~]/) {
  err("non-printing", "non-printing ASCII character(s)");
}

match($0, /[ \t]+$/) {
  err("white-at-eol", "whitespace at end of line");
}

match($0, /\$Id(: .*)?\$/) {
  err("svn-keyword", "SVN keyword marker");
}

# Lines without TAB: a long URL is removed before measuring the line.
$0 !~ /\t/ && length($0) > 80 {
  t = utf8_decode($0);
  sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{73,}/, "", t);
  if (length(t) > 80) {
    RSTART = 81;
    RLENGTH = 0;
    err_if("very-long-line", "long-line", "line is over 80 columns");
  }
}

$0 !~ /\t/ && length($0) > 132 {
  RSTART = 133;
  RLENGTH = 0;
  t = utf8_decode($0);
  sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{125,}/, "", t);
  if (length(t) > 132) {
    err("very-long-line", "line is over 132 columns");
  }
}

# Header-recognition automaton. Read this from bottom to top.
# Valid UTF-8 chars are recognised in copyright and authors
# TODO: ensure all files are valid UTF-8 before awking them.
# Note that this code also assumes that combining characters are NOT
# used (i.e. that every Unicode code-point corresponds to exactly
# one displayed character, i.e. no Camels and no including
# weird-and-wonderful ways of encoded accented letters).
# Every header line holds exactly 72 characters between its two stars;
# the literal padding in the patterns below is chosen so that padding
# plus repetition counts add up to 72.

state == "close" && $0 ~ /\*{74}/ { state = "OK"; }
state == "close" { state = "(last line)"; }
state == "blurb" && $0 ~ /\* {72}\*/ { state = "close"; }
state == "blurb" && $0 ~ /\/LICENSE/ { state = "(license path)" }
state == "blurb1" && $0 ~ /\*   All rights reserved. .{47} \*/ \
  { state = "blurb"; }
state == "blurb1" { state = "(blurb line 1)"; }
state == "copyright" && $0 ~ /\* {72}\*/ { state = "blurb1"; }
state == "copyright" \
&& $0 !~ /\*   Copyright [0-9]{4}([\300-\367][\200-\277]+|.){54} \*/ \
&& $0 !~ /\*     ([\300-\367][\200-\277]+|.){66} \*/ \
  { state = "(copyright lines)"; }
state == "authors" && $0 ~ /\* {72}\*/ { state = "copyright"; }
state == "authors" \
&& $0 !~ /\* ([\300-\367][\200-\277]+|.){70} \*/ \
  { state = "(authors)"; }
state == "blank2" && $0 ~ /\* {72}\*/ { state = "authors"; }
state == "blank2" { state = "(blank line 2)"; }
state == "title" && $0 ~ /\* {33}OCaml {34}\*/ { state = "blank2"; }
state == "title" { state = "(title line)"; }
state == "blank1" && $0 ~ /\* {72}\*/ { state = "title"; }
state == "blank1" { state = "(blank line 1)"; }
state == "(first line)" && NR < 4 && $0 ~ /\*{74}/ { state = "blank1"; }

{
  prev_line = last_line;
  last_line = $0;
}

END {
  # The shell side appended one newline: a non-empty last record means
  # that the file itself did not end with a linefeed.
  if (match(last_line, /.+/)){
    err("missing-lf", "missing linefeed at EOF");
    prev_line = last_line;
    ++ NR;
    empty_file = 0;
  }else{
    empty_file = NR == 1;
  }
  if (!empty_file && match(prev_line, /^$/)){
    err("white-at-eof", "empty line(s) at EOF");
  }
  if (state != "OK"){
    if (NR >= 10){
      NR = 1;
      RSTART = 1;
      RLENGTH = 0;
      err("missing-header", sprintf("bad copyright header %s", state));
    }else{
      counts["missing-header"] = 1;
    }
  }
  # Report the attributes which disable a rule that never triggered,
  # except those given in their "may" form (question mark suffix).
  split(attr_rules, r, "[? ]");
  for (i in r){
    name = r[i];
    if (name != "" && !counts[name]){
      NR = 1;
      RSTART = 1;
      RLENGTH = 0;
      if (attr_rules !~ (" " name "\\? ")) {
        report_err(name, "attribute is unused");
      }
    }
  }
  exit got_errors;
}
' ; then
      EXIT_CODE=1
    fi
  done
  exit $EXIT_CODE
)