From eb15c126c34212c1fe472cbabc6ff785f4d48e97 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 8 Mar 2021 17:55:50 -0800 Subject: [PATCH] Fix numericStringLess and add tests numericStringLess("1a", "11a") would strip the equal prefix "1" and then compare the bytes "a" and "1", when it should have compared the numbers 1 and 11. Fix it by handling the case where the last equal byte was numeric and the first differing byte is numeric in one string and non-numeric in the other. numericStringLess("12", "101") would strip the equal prefix "1" and then compare the numbers 2 and 01, when it should have compared the numbers 12 and 101. Fix it by tracking the beginning of the sequence of numeric bytes containing the differing byte. Test: sort_test.go Change-Id: I8d9252a64625ba6a3c75d09bb1429dcb1115e3e1 --- Blueprints | 1 + parser/sort.go | 106 ++++++++++++++++++++++++++++---------------- parser/sort_test.go | 96 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 37 deletions(-) create mode 100644 parser/sort_test.go diff --git a/Blueprints b/Blueprints index 364fbd0..ab9fd3c 100644 --- a/Blueprints +++ b/Blueprints @@ -47,6 +47,7 @@ bootstrap_go_package { "parser/modify_test.go", "parser/parser_test.go", "parser/printer_test.go", + "parser/sort_test.go", ], } diff --git a/parser/sort.go b/parser/sort.go index 6d6c33c..0379d45 100644 --- a/parser/sort.go +++ b/parser/sort.go @@ -22,64 +22,96 @@ import ( "text/scanner" ) +// numericStringLess compares two strings, returning a lexicographical comparison unless the first +// difference occurs in a sequence of 1 or more numeric characters, in which case it returns the +// numerical comparison of the two numbers. func numericStringLess(a, b string) bool { - byteIndex := 0 - // Start with a byte comparison to find where the strings differ - for ; byteIndex < len(a) && byteIndex < len(b); byteIndex++ { - if a[byteIndex] != b[byteIndex] { - break - } - } - - if byteIndex == len(a) && byteIndex != len(b) { - // Reached the end of a. a is a prefix of b. - return true - } else if byteIndex == len(b) { - // Reached the end of b. b is a prefix of a or b is equal to a. - return false - } - - // Save the first differing bytes in case we have to fall back to a byte comparison. - aDifferingByte := a[byteIndex] - bDifferingByte := b[byteIndex] - - // Save the differing suffixes of the strings. This may be invalid utf8 if the first - // rune was cut, but that's fine because are only looking for the bytes '0'-'9', which - // can only occur as single-byte runes in utf8. - aDifference := a[byteIndex:] - bDifference := b[byteIndex:] - isNumeric := func(r rune) bool { return r >= '0' && r <= '9' } isNotNumeric := func(r rune) bool { return !isNumeric(r) } - // If the first runes are both numbers do a numeric comparison. - if isNumeric(rune(aDifferingByte)) && isNumeric(rune(bDifferingByte)) { + minLength := len(a) + if len(b) < minLength { + minLength = len(b) + } + + byteIndex := 0 + numberStartIndex := -1 + + var aByte, bByte byte + + // Start with a byte comparison to find where the strings differ. + for ; byteIndex < minLength; byteIndex++ { + aByte, bByte = a[byteIndex], b[byteIndex] + if aByte != bByte { + break + } + byteIsNumeric := isNumeric(rune(aByte)) + if numberStartIndex != -1 && !byteIsNumeric { + numberStartIndex = -1 + } else if numberStartIndex == -1 && byteIsNumeric { + numberStartIndex = byteIndex + } + } + + // Handle the case where we reached the end of one or both strings without finding a difference. + if byteIndex == minLength { + if len(a) < len(b) { + // Reached the end of a. a is a prefix of b. + return true + } else { + // Reached the end of b. b is a prefix of a or b is equal to a. + return false + } + } + + aByteNumeric := isNumeric(rune(aByte)) + bByteNumeric := isNumeric(rune(bByte)) + + if (aByteNumeric || bByteNumeric) && !(aByteNumeric && bByteNumeric) && numberStartIndex != -1 { + // Only one of aByte and bByte is a number, but the previous byte was a number. That means + // one is a longer number with the same prefix, which must be numerically larger. If bByte + // is a number then the number in b is numerically larger than the number in a. + return bByteNumeric + } + + // If the bytes are both numbers do a numeric comparison. + if aByteNumeric && bByteNumeric { + // Extract the numbers from each string, starting from the first number after the last + // non-number. This won't be invalid utf8 because we are only looking for the bytes + //'0'-'9', which can only occur as single-byte runes in utf8. + if numberStartIndex == -1 { + numberStartIndex = byteIndex + } + aNumberString := a[numberStartIndex:] + bNumberString := b[numberStartIndex:] + // Find the first non-number in each, using the full length if there isn't one. - endANumbers := strings.IndexFunc(aDifference, isNotNumeric) - endBNumbers := strings.IndexFunc(bDifference, isNotNumeric) + endANumbers := strings.IndexFunc(aNumberString, isNotNumeric) + endBNumbers := strings.IndexFunc(bNumberString, isNotNumeric) if endANumbers == -1 { - endANumbers = len(aDifference) + endANumbers = len(aNumberString) } if endBNumbers == -1 { - endBNumbers = len(bDifference) + endBNumbers = len(bNumberString) } + // Convert each to an int. - aNumber, err := strconv.Atoi(aDifference[:endANumbers]) + aNumber, err := strconv.Atoi(aNumberString[:endANumbers]) if err != nil { panic(fmt.Errorf("failed to convert %q from %q to number: %w", - aDifference[:endANumbers], a, err)) + aNumberString[:endANumbers], a, err)) } - bNumber, err := strconv.Atoi(bDifference[:endBNumbers]) + bNumber, err := strconv.Atoi(bNumberString[:endBNumbers]) if err != nil { panic(fmt.Errorf("failed to convert %q from %q to number: %w", - bDifference[:endBNumbers], b, err)) + bNumberString[:endBNumbers], b, err)) } // Do a numeric comparison. return aNumber < bNumber } // At least one is not a number, do a byte comparison. - return aDifferingByte < bDifferingByte + return aByte < bByte } func SortLists(file *File) { diff --git a/parser/sort_test.go b/parser/sort_test.go new file mode 100644 index 0000000..0a9e7fc --- /dev/null +++ b/parser/sort_test.go @@ -0,0 +1,96 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package parser + +import "testing" + +func Test_numericStringLess(t *testing.T) { + type args struct { + a string + b string + } + tests := []struct { + a, b string + }{ + {"a", "b"}, + {"aa", "ab"}, + {"aaa", "aba"}, + + {"1", "2"}, + {"1", "11"}, + {"2", "11"}, + {"1", "12"}, + + {"12", "101"}, + {"11", "102"}, + + {"0", "1"}, + {"0", "01"}, + {"1", "02"}, + {"01", "002"}, + {"001", "02"}, + } + + oneTest := func(a, b string, want bool) { + t.Helper() + if got := numericStringLess(a, b); got != want { + t.Errorf("want numericStringLess(%v, %v) = %v, got %v", a, b, want, got) + } + } + + for _, tt := range tests { + t.Run(tt.a+"<"+tt.b, func(t *testing.T) { + // a should be less than b + oneTest(tt.a, tt.b, true) + // b should not be less than a + oneTest(tt.b, tt.a, false) + // a should not be less than a + oneTest(tt.a, tt.a, false) + // b should not be less than b + oneTest(tt.b, tt.b, false) + + // The same should be true both strings are prefixed with an "a" + oneTest("a"+tt.a, "a"+tt.b, true) + oneTest("a"+tt.b, "a"+tt.a, false) + oneTest("a"+tt.a, "a"+tt.a, false) + oneTest("a"+tt.b, "a"+tt.b, false) + + // The same should be true both strings are suffixed with an "a" + oneTest(tt.a+"a", tt.b+"a", true) + oneTest(tt.b+"a", tt.a+"a", false) + oneTest(tt.a+"a", tt.a+"a", false) + oneTest(tt.b+"a", tt.b+"a", false) + + // The same should be true both strings are suffixed with a "1" + oneTest(tt.a+"1", tt.b+"1", true) + oneTest(tt.b+"1", tt.a+"1", false) + oneTest(tt.a+"1", tt.a+"1", false) + oneTest(tt.b+"1", tt.b+"1", false) + + // The same should be true both strings are prefixed with a "0" + oneTest("0"+tt.a, "0"+tt.b, true) + oneTest("0"+tt.b, "0"+tt.a, false) + oneTest("0"+tt.a, "0"+tt.a, false) + oneTest("0"+tt.b, "0"+tt.b, false) + + // The same should be true both strings are suffixed with a "0" + oneTest(tt.a+"0", tt.b+"0", true) + oneTest(tt.b+"0", tt.a+"0", false) + oneTest(tt.a+"0", tt.a+"0", false) + oneTest(tt.b+"0", tt.b+"0", false) + + }) + } +}