From eb15c126c34212c1fe472cbabc6ff785f4d48e97 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 8 Mar 2021 17:55:50 -0800
Subject: [PATCH] Fix numericStringLess and add tests

numericStringLess("1a", "11a") would strip the equal prefix "1" and
then compare the bytes "a" and "1", when it should have compared the
numbers 1 and 11.  Fix it by handling the case where the last equal
byte was numeric and the first differing byte is numeric in one
string and non-numeric in the other.

numericStringLess("12", "101") would strip the equal prefix "1" and
then compare the numbers 2 and 01, when it should have compared the
numbers 12 and 101.  Fix it by tracking the beginning of the sequence
of numeric bytes containing the differing byte.

Test: sort_test.go
Change-Id: I8d9252a64625ba6a3c75d09bb1429dcb1115e3e1
---
 Blueprints          |   1 +
 parser/sort.go      | 106 ++++++++++++++++++++++++++++----------------
 parser/sort_test.go |  96 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 37 deletions(-)
 create mode 100644 parser/sort_test.go

diff --git a/Blueprints b/Blueprints
index 364fbd0..ab9fd3c 100644
--- a/Blueprints
+++ b/Blueprints
@@ -47,6 +47,7 @@ bootstrap_go_package {
         "parser/modify_test.go",
         "parser/parser_test.go",
         "parser/printer_test.go",
+	"parser/sort_test.go",
     ],
 }
 
diff --git a/parser/sort.go b/parser/sort.go
index 6d6c33c..0379d45 100644
--- a/parser/sort.go
+++ b/parser/sort.go
@@ -22,64 +22,96 @@ import (
 	"text/scanner"
 )
 
+// numericStringLess compares two strings, returning a lexicographical comparison unless the first
+// difference occurs in a sequence of 1 or more numeric characters, in which case it returns the
+// numerical comparison of the two numbers.
 func numericStringLess(a, b string) bool {
-	byteIndex := 0
-	// Start with a byte comparison to find where the strings differ
-	for ; byteIndex < len(a) && byteIndex < len(b); byteIndex++ {
-		if a[byteIndex] != b[byteIndex] {
-			break
-		}
-	}
-
-	if byteIndex == len(a) && byteIndex != len(b) {
-		// Reached the end of a.  a is a prefix of b.
-		return true
-	} else if byteIndex == len(b) {
-		// Reached the end of b.  b is a prefix of a or b is equal to a.
-		return false
-	}
-
-	// Save the first differing bytes in case we have to fall back to a byte comparison.
-	aDifferingByte := a[byteIndex]
-	bDifferingByte := b[byteIndex]
-
-	// Save the differing suffixes of the strings.  This may be invalid utf8 if the first
-	// rune was cut, but that's fine because are only looking for the bytes '0'-'9', which
-	// can only occur as single-byte runes in utf8.
-	aDifference := a[byteIndex:]
-	bDifference := b[byteIndex:]
-
 	isNumeric := func(r rune) bool { return r >= '0' && r <= '9' }
 	isNotNumeric := func(r rune) bool { return !isNumeric(r) }
 
-	// If the first runes are both numbers do a numeric comparison.
-	if isNumeric(rune(aDifferingByte)) && isNumeric(rune(bDifferingByte)) {
+	minLength := len(a)
+	if len(b) < minLength {
+		minLength = len(b)
+	}
+
+	byteIndex := 0
+	numberStartIndex := -1
+
+	var aByte, bByte byte
+
+	// Start with a byte comparison to find where the strings differ.
+	for ; byteIndex < minLength; byteIndex++ {
+		aByte, bByte = a[byteIndex], b[byteIndex]
+		if aByte != bByte {
+			break
+		}
+		byteIsNumeric := isNumeric(rune(aByte))
+		if numberStartIndex != -1 && !byteIsNumeric {
+			numberStartIndex = -1
+		} else if numberStartIndex == -1 && byteIsNumeric {
+			numberStartIndex = byteIndex
+		}
+	}
+
+	// Handle the case where we reached the end of one or both strings without finding a difference.
+	if byteIndex == minLength {
+		if len(a) < len(b) {
+			// Reached the end of a.  a is a prefix of b.
+			return true
+		} else {
+			// Reached the end of b.  b is a prefix of a or b is equal to a.
+			return false
+		}
+	}
+
+	aByteNumeric := isNumeric(rune(aByte))
+	bByteNumeric := isNumeric(rune(bByte))
+
+	if (aByteNumeric || bByteNumeric) && !(aByteNumeric && bByteNumeric) && numberStartIndex != -1 {
+		// Only one of aByte and bByte is a number, but the previous byte was a number.  That means
+		// one is a longer number with the same prefix, which must be numerically larger.  If bByte
+		// is a number then the number in b is numerically larger than the number in a.
+		return bByteNumeric
+	}
+
+	// If the bytes are both numbers do a numeric comparison.
+	if aByteNumeric && bByteNumeric {
+		// Extract the numbers from each string, starting from the first number after the last
+		// non-number.  This won't be invalid utf8 because we are only looking for the bytes
+		//'0'-'9', which can only occur as single-byte runes in utf8.
+		if numberStartIndex == -1 {
+			numberStartIndex = byteIndex
+		}
+		aNumberString := a[numberStartIndex:]
+		bNumberString := b[numberStartIndex:]
+
 		// Find the first non-number in each, using the full length if there isn't one.
-		endANumbers := strings.IndexFunc(aDifference, isNotNumeric)
-		endBNumbers := strings.IndexFunc(bDifference, isNotNumeric)
+		endANumbers := strings.IndexFunc(aNumberString, isNotNumeric)
+		endBNumbers := strings.IndexFunc(bNumberString, isNotNumeric)
 		if endANumbers == -1 {
-			endANumbers = len(aDifference)
+			endANumbers = len(aNumberString)
 		}
 		if endBNumbers == -1 {
-			endBNumbers = len(bDifference)
+			endBNumbers = len(bNumberString)
 		}
+
 		// Convert each to an int.
-		aNumber, err := strconv.Atoi(aDifference[:endANumbers])
+		aNumber, err := strconv.Atoi(aNumberString[:endANumbers])
 		if err != nil {
 			panic(fmt.Errorf("failed to convert %q from %q to number: %w",
-				aDifference[:endANumbers], a, err))
+				aNumberString[:endANumbers], a, err))
 		}
-		bNumber, err := strconv.Atoi(bDifference[:endBNumbers])
+		bNumber, err := strconv.Atoi(bNumberString[:endBNumbers])
 		if err != nil {
 			panic(fmt.Errorf("failed to convert %q from %q to number: %w",
-				bDifference[:endBNumbers], b, err))
+				bNumberString[:endBNumbers], b, err))
 		}
 		// Do a numeric comparison.
 		return aNumber < bNumber
 	}
 
 	// At least one is not a number, do a byte comparison.
-	return aDifferingByte < bDifferingByte
+	return aByte < bByte
 }
 
 func SortLists(file *File) {
diff --git a/parser/sort_test.go b/parser/sort_test.go
new file mode 100644
index 0000000..0a9e7fc
--- /dev/null
+++ b/parser/sort_test.go
@@ -0,0 +1,96 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package parser
+
+import "testing"
+
+func Test_numericStringLess(t *testing.T) {
+	type args struct {
+		a string
+		b string
+	}
+	tests := []struct {
+		a, b string
+	}{
+		{"a", "b"},
+		{"aa", "ab"},
+		{"aaa", "aba"},
+
+		{"1", "2"},
+		{"1", "11"},
+		{"2", "11"},
+		{"1", "12"},
+
+		{"12", "101"},
+		{"11", "102"},
+
+		{"0", "1"},
+		{"0", "01"},
+		{"1", "02"},
+		{"01", "002"},
+		{"001", "02"},
+	}
+
+	oneTest := func(a, b string, want bool) {
+		t.Helper()
+		if got := numericStringLess(a, b); got != want {
+			t.Errorf("want numericStringLess(%v, %v) = %v, got %v", a, b, want, got)
+		}
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.a+"<"+tt.b, func(t *testing.T) {
+			// a should be less than b
+			oneTest(tt.a, tt.b, true)
+			// b should not be less than a
+			oneTest(tt.b, tt.a, false)
+			// a should not be less than a
+			oneTest(tt.a, tt.a, false)
+			// b should not be less than b
+			oneTest(tt.b, tt.b, false)
+
+			// The same should be true both strings are prefixed with an "a"
+			oneTest("a"+tt.a, "a"+tt.b, true)
+			oneTest("a"+tt.b, "a"+tt.a, false)
+			oneTest("a"+tt.a, "a"+tt.a, false)
+			oneTest("a"+tt.b, "a"+tt.b, false)
+
+			// The same should be true both strings are suffixed with an "a"
+			oneTest(tt.a+"a", tt.b+"a", true)
+			oneTest(tt.b+"a", tt.a+"a", false)
+			oneTest(tt.a+"a", tt.a+"a", false)
+			oneTest(tt.b+"a", tt.b+"a", false)
+
+			// The same should be true both strings are suffixed with a "1"
+			oneTest(tt.a+"1", tt.b+"1", true)
+			oneTest(tt.b+"1", tt.a+"1", false)
+			oneTest(tt.a+"1", tt.a+"1", false)
+			oneTest(tt.b+"1", tt.b+"1", false)
+
+			// The same should be true both strings are prefixed with a "0"
+			oneTest("0"+tt.a, "0"+tt.b, true)
+			oneTest("0"+tt.b, "0"+tt.a, false)
+			oneTest("0"+tt.a, "0"+tt.a, false)
+			oneTest("0"+tt.b, "0"+tt.b, false)
+
+			// The same should be true both strings are suffixed with a "0"
+			oneTest(tt.a+"0", tt.b+"0", true)
+			oneTest(tt.b+"0", tt.a+"0", false)
+			oneTest(tt.a+"0", tt.a+"0", false)
+			oneTest(tt.b+"0", tt.b+"0", false)
+
+		})
+	}
+}