From 971b8702719cf2ed697a6017deeff1f2facc0080 Mon Sep 17 00:00:00 2001 From: Kerin Millar Date: Thu, 1 Aug 2024 20:16:02 +0100 Subject: Render contains_all() and contains_any() faster Re-implement the contains_all() and contains_any() functions in such a way that they are faster than their forebears by an order of magnitude. In order to achieve this level of performance, the value of IFS is no longer taken into account. Instead, words are always presumed to be separated by characters matching the [[:space:]] character class. Consider a scenario in which the FEATURES variable is comprised of 33 words. $ FEATURES="assume-digests binpkg-docompress binpkg-dostrip binpkg-logs buildpkg buildpkg-live config-protect-if-modified distlocks ebuild-locks fixlafiles ipc-sandbox merge-sync merge-wait multilib-strict network-sandbox news parallel-fetch pid-sandbox pkgdir-index-trusted preserve-libs protect-owned qa-unresolved-soname-deps sandbox sfperms strict unknown-features-warn unmerge-logs unmerge-orphans userfetch userpriv usersandbox usersync xattr" Let's say that the contains_any function is used to search for 10 words, where only the 10th can be matched and where FEATURES must be scanned in its entirety exactly 10 times. $ contains_any "$FEATURES" the quick brown fox jumped over the lazy hen xattr The following benchmarks show how long it took to call the function 50,000 times consecutively on a system with an Apple M1 CPU for both the original and new implementations. This is with the dash shell. contains_any (BEFORE) real 0m19.135s user 0m16.781s sys 0m2.258s contains_any (AFTER) real 0m1.571s user 0m1.497s sys 0m0.063s Now let's say that the contains_all function is used to search for 3 words, where all can be matched while requiring for FEATURES to be scanned in its entirety at least once. $ contains_all "$FEATURES" assume-digests news xattr Again, The following benchmarks show how long it took to call the function 50,000 times consecutively. contains_all (BEFORE) real 1m8.052s user 0m19.363s sys 0m42.742s contains_all (AFTER) real 0m0.689s user 0m0.627s sys 0m0.057s The performance improvements are similarly impressive if using bash. Signed-off-by: Kerin Millar --- functions.sh | 138 ++++++++++++++++++--------------------------------------- test-functions | 125 +++++++++++++-------------------------------------- 2 files changed, 74 insertions(+), 189 deletions(-) diff --git a/functions.sh b/functions.sh index 036e3a7..f120564 100644 --- a/functions.sh +++ b/functions.sh @@ -22,7 +22,7 @@ # COLUMNS : may be used by _update_columns() to get the column count # EPOCHREALTIME : potentially used by _update_time() to get the time # GENFUN_MODULES : which of the optional function collections must be sourced -# IFS : affects contains_all(), contains_any() and warn() +# IFS : warn() operands are joined by its first character # INVOCATION_ID : used by from_unit() # PORTAGE_BIN_PATH : used by from_portage() # RC_OPENRC_PID : used by from_runscript() @@ -54,112 +54,60 @@ chdir() } # -# Takes the first parameter as a string comprising zero or more words, composes -# a set consisting of the intersection of those words, then determines whether -# the intersection of the remaining parameters forms a subset thereof. The -# words shall be collected by splitting the string into individual fields, in -# accordance with section 2.6.5 of the Shell Command Language specification. -# Therefore, the value of IFS shall be taken into account. If fewer than two -# parameters are provided, or if the first parameter yields no fields, or if the -# second set is disjoint from - or a superset of - the first, the return value -# shall be greater than 0. +# Considers the first parameter as a string comprising zero or more +# whitespace-separated words then determines whether all of the remaining +# parameters can be found within the resulting list in their capacity as +# discrete words. If they cannot be, or if fewer than two parameters were given, +# the return value shall be 1. Of the words to be searched for, any which are +# empty or which contain whitespace characters shall be deemed unfindable. # contains_all() { - # shellcheck disable=2097,2098 - [ "$#" -ge 2 ] && - IFS=${IFS} awk -v ifs_set=${IFS+1} -f - -- "$@" <<-'EOF' - BEGIN { - haystack = ARGV[1] - argc = ARGC - ARGC = 1 - if (ifs_set) { - ifs = ENVIRON["IFS"] - } else { - ifs = " \t\n" - } - # Translate IFS to FS, in accordance with section 2.6.5. - if (length(ifs) == 0) { - FS = "^" - } else { - fs = "(" - for (i = 1; i <= length(ifs); i++) { - char = substr(ifs, i, 1) - if (seen[char]++) { - continue - } else if (char ~ /[ \t\n]/) { - whitespace = whitespace char - fs = fs "[" char "]+|" - } else { - fs = fs "[" char "]|" - } - } - sub(/\|$/, "", fs) - FS = fs = fs ")" - } - # Leading whitespace characters must be removed. - if (length(whitespace) > 0) { - sub("^[" whitespace "]+", "", haystack) - } - # In sh, fields are terminated, not separated. - sub(FS "$", "", haystack) - len = split(haystack, words) - for (i = 1; i <= len; i++) { - set2[words[i]] - } - for (i = 2; i < argc; i++) { - set1[ARGV[i]] - } - for (word in set2) { - delete set1[word] - } - for (word in set1) { - exit 1 - } - } - EOF + local arg haystack + + [ "$#" -gt 1 ] || return + haystack=" $1 " + shift + for arg; do + case ${arg} in + ''|*[[:space:]]*) + return 1 + esac + case ${haystack} in + *" ${arg} "*) + ;; + *) + return 1 + esac + done } # -# Takes the first parameter as a string comprising zero or more words then -# determines whether at least one of the remaining parameters can be matched -# against any of those words. The words shall be collected by splitting the -# string into individual fields, in accordance with section 2.6.5 of the Shell -# Command Language specification. Therefore, the value of IFS shall be taken -# into account. If fewer than two parameters are provided, or if the first -# parameter yields no fields, or if none of the following parameters can be -# matched, the return value shall be greater than 0. +# Considers the first parameter as a string comprising zero or more +# whitespace-separated words then determines whether any of the remaining +# parameters can be found within the resulting list in their capacity as +# discrete words. If none can be, or if no parameters were given, the return +# value shall be greater than 0. Of the words to be searched for, any which are +# empty or which contain whitespace characters shall be disregarded. # contains_any() { - local had_noglob haystack i item needle retval + local arg haystack - [ "$#" -ge 2 ] || return - haystack=$1 + [ "$#" -gt 0 ] || return + haystack=" $1 " shift - i=0 - case $- in - *f*) - had_noglob=1 - ;; - *) - had_noglob=0 - esac - set -f - for needle; do - if [ "$(( i += 1 ))" -eq 1 ]; then - # shellcheck disable=2086 - set -- ${haystack} - fi - for item; do - [ "${item}" = "${needle}" ] && break 2 - done + for arg; do + case ${arg} in + ''|*[[:space:]]*) + continue + esac + case ${haystack} in + *" ${arg} "*) + return 0 + esac done - retval=$? - if [ "${had_noglob}" -eq 0 ]; then - set +f - fi - return "${retval}" + false } # diff --git a/test-functions b/test-functions index ef2aa98..f11234a 100755 --- a/test-functions +++ b/test-functions @@ -744,105 +744,42 @@ test_substr() { test_contains_all() { set -- \ - ge 1 default N/A N/A N/A N/A \ - ge 1 default ' foo bar ' '' N/A N/A \ - ge 1 default ' foo bar ' '' ' ' N/A \ - ge 1 default ' foo bar ' '' ' bar' N/A \ - ge 1 default ' foo bar ' '' 'foo ' N/A \ - ge 1 default ' foo bar ' '' 'foo bar' N/A \ - ge 1 default ' foo bar ' ' ' '' N/A \ - ge 1 default ' foo bar ' ' ' ' ' N/A \ - ge 1 default ' foo bar ' ' ' N/A N/A \ - ge 1 default ' foo bar ' ' bar' '' N/A \ - ge 1 default ' foo bar ' ' bar' N/A N/A \ - ge 1 default ' foo bar ' 'foo ' '' N/A \ - ge 1 default ' foo bar ' 'foo ' ' bar' N/A \ - ge 1 default ' foo bar ' 'foo ' N/A N/A \ - ge 1 default ' foo bar ' 'foo bar' '' N/A \ - ge 1 default ' foo bar ' 'foo bar' N/A N/A \ - ge 1 default ' foo bar ' N/A N/A N/A \ - ge 1 default ' foo bar ' bar foo '' \ - ge 1 default ' foo bar ' bar foo ' ' \ - ge 1 default ' foo bar ' baz bar foo \ - ge 1 default ' foo bar ' fo ba N/A \ - ge 1 default ' foo bar ' foo bar '' \ - ge 1 default ' foo bar ' foo bar ' ' \ - ge 1 default ' foo bar ' foo bar baz \ - ge 1 default ' foo bar ' o a N/A \ - ge 1 default ' foo bar ' oo ar N/A \ - eq 0 default ' foo bar ' foo bar N/A \ - eq 0 default ' foo bar ' bar foo N/A \ - ge 1 ', ' N/A N/A N/A N/A \ - ge 1 ', ' ' foo bar ' '' N/A N/A \ - ge 1 ', ' ' foo bar ' '' ' ' N/A \ - ge 1 ', ' ' foo ,bar, ' '' ',' N/A \ - ge 1 ', ' ' foo bar ' '' ' bar' N/A \ - ge 1 ', ' ' foo ,bar ' '' ',bar' N/A \ - ge 1 ', ' ' foo bar ' '' 'foo ' N/A \ - ge 1 ', ' ' foo, bar ' '' 'foo,' N/A \ - ge 1 ', ' ' foo bar ' '' 'foo bar' N/A \ - ge 1 ', ' ' foo bar ' ' ' '' N/A \ - ge 1 ', ' ' foo,bar, ' ',' '' N/A \ - ge 1 ', ' ' foo bar ' ' ' ' ' N/A \ - ge 1 ', ' ',foo,,bar,' ',' ',' N/A \ - ge 1 ', ' ' foo bar ' ' ' N/A N/A \ - ge 1 ', ' ',foo,,bar,' ',' N/A N/A \ - ge 1 ', ' ' foo bar ' ' bar' '' N/A \ - ge 1 ', ' 'foo,bar,' ',bar' '' N/A \ - ge 1 ', ' ' foo bar ' ' bar' N/A N/A \ - ge 1 ', ' ',foo,,bar,' ',bar' N/A N/A \ - ge 1 ', ' ' foo bar ' 'foo ' '' N/A \ - ge 1 ', ' 'foo,bar,' 'foo,' '' N/A \ - ge 1 ', ' ' foo bar ' 'foo ' ' bar' N/A \ - ge 1 ', ' ',foo,,bar,' 'foo,' ',bar' N/A \ - ge 1 ', ' ' foo bar ' 'foo ' N/A N/A \ - ge 1 ', ' ',foo,,bar,' 'foo,' N/A N/A \ - ge 1 ', ' ' foo bar ' 'foo bar' '' N/A \ - ge 1 ', ' 'foo,bar,' 'foo,bar' '' N/A \ - ge 1 ', ' ' foo bar ' 'foo bar' N/A N/A \ - ge 1 ', ' ',foo,,bar,' 'foo,,bar' N/A N/A \ - ge 1 ', ' ' foo bar ' N/A N/A N/A \ - ge 1 ', ' ' foo bar ' bar foo '' \ - ge 1 ', ' ' foo,bar ' bar foo '' \ - ge 1 ', ' ' foo bar ' bar foo ' ' \ - ge 1 ', ' ' foo,,bar ' bar foo ',' \ - ge 1 ', ' ' foo bar ' baz bar foo \ - ge 1 ', ' ',foo,,bar,' baz bar foo \ - ge 1 ', ' ' foo bar ' fo ba N/A \ - ge 1 ', ' ',foo,,bar,' fo ba N/A \ - ge 1 ', ' ' foo bar ' foo bar '' \ - ge 1 ', ' ' foo,bar ' foo bar '' \ - ge 1 ', ' ' foo bar ' foo bar ' ' \ - ge 1 ', ' ' foo,,bar ' foo bar ',' \ - ge 1 ', ' ' foo bar ' foo bar baz \ - ge 1 ', ' ',foo,,bar,' foo bar baz \ - ge 1 ', ' ' foo bar ' o a N/A \ - ge 1 ', ' ',foo,,bar,' o a N/A \ - ge 1 ', ' ' foo bar ' oo ar N/A \ - ge 1 ', ' ',foo,,bar,' oo ar N/A \ - eq 0 ', ' ' foo bar ' foo bar N/A \ - eq 0 ', ' ',foo bar,' foo bar N/A \ - eq 0 ', ' ' foo,,bar ' foo bar N/A \ - eq 0 ', ' ',foo,,bar,' foo bar N/A \ - eq 0 ', ' ' foo bar ' bar foo N/A \ - eq 0 ', ' ',foo bar,' bar foo N/A \ - eq 0 ', ' ' foo,,bar ' bar foo N/A \ - eq 0 ', ' ',foo,,bar,' bar foo N/A + ge 1 N/A N/A N/A N/A \ + ge 1 ' foo bar ' '' N/A N/A \ + ge 1 ' foo bar ' '' ' ' N/A \ + ge 1 ' foo bar ' '' ' bar' N/A \ + ge 1 ' foo bar ' '' 'foo ' N/A \ + ge 1 ' foo bar ' '' 'foo bar' N/A \ + ge 1 ' foo bar ' ' ' '' N/A \ + ge 1 ' foo bar ' ' ' ' ' N/A \ + ge 1 ' foo bar ' ' ' N/A N/A \ + ge 1 ' foo bar ' ' bar' '' N/A \ + ge 1 ' foo bar ' ' bar' N/A N/A \ + ge 1 ' foo bar ' 'foo ' '' N/A \ + ge 1 ' foo bar ' 'foo ' ' bar' N/A \ + ge 1 ' foo bar ' 'foo ' N/A N/A \ + ge 1 ' foo bar ' 'foo bar' '' N/A \ + ge 1 ' foo bar ' 'foo bar' N/A N/A \ + ge 1 ' foo bar ' N/A N/A N/A \ + ge 1 ' foo bar ' bar foo '' \ + ge 1 ' foo bar ' bar foo ' ' \ + ge 1 ' foo bar ' baz bar foo \ + ge 1 ' foo bar ' fo ba N/A \ + ge 1 ' foo bar ' foo bar '' \ + ge 1 ' foo bar ' foo bar ' ' \ + ge 1 ' foo bar ' foo bar baz \ + ge 1 ' foo bar ' o a N/A \ + ge 1 ' foo bar ' oo ar N/A \ + eq 0 ' foo bar ' foo bar N/A \ + eq 0 ' foo bar ' bar foo N/A callback() { shift - ifs=$1 - shift - if [ "${ifs}" = "default" ]; then - test_description="contains_all $(quote_args "$@")" - contains_all "$@" - else - test_description="IFS='${ifs}' contains_all $(quote_args "$@")" - IFS=${ifs} contains_all "$@" - fi + test_description="contains_all $(quote_args "$@")" + contains_all "$@" } - iterate_tests 7 "$@" + iterate_tests 6 "$@" } test_contains_any() { -- cgit v1.2.3-65-gdbad