Sort hiddenapi monolithic files by signature

Adds a new --key_field option to merge_csv.py that specifies the name
of the field by which the rows should be sorted. If specified, it
makes that field the first field in each row and performs the merge
step of a merge sort on the input files. This assumes that each
input file is already sorted by that field.

Modifies the rules that use merge_csv.py to pass in:
    --key_field signature
to sort the rows by signature.

Bug: 180387396
Test: Verified that hiddenapi files (both the aggregated ones and those
      for the individual modules) are not affected by this change other
      than a change in the order of the rows.
Change-Id: Idcd5f0fea373b520b604889e1c280f21ed495660
This commit is contained in:
Paul Duffin 2021-02-16 16:57:06 +00:00
parent 82b3fcf123
commit 2c36f24082
3 changed files with 33 additions and 4 deletions

View file

@@ -254,6 +254,7 @@ func (h *hiddenAPI) hiddenAPIExtractInformation(ctx android.ModuleContext, dexJa
rule.Command().
BuiltTool("merge_csv").
Flag("--zip_input").
Flag("--key_field signature").
FlagWithOutput("--output=", indexCSV).
Inputs(classesJars)
rule.Build("merged-hiddenapi-index", "Merged Hidden API index")

View file

@@ -424,6 +424,7 @@ func metadataRule(ctx android.SingletonContext) android.Path {
rule.Command().
BuiltTool("merge_csv").
Flag("--key_field signature").
FlagWithOutput("--output=", outputPath).
Inputs(metadataCSV)
@@ -535,6 +536,7 @@ func (h *hiddenAPIIndexSingleton) GenerateBuildActions(ctx android.SingletonCont
rule := android.NewRuleBuilder(pctx, ctx)
rule.Command().
BuiltTool("merge_csv").
Flag("--key_field signature").
FlagWithArg("--header=", "signature,file,startline,startcol,endline,endcol,properties").
FlagWithOutput("--output=", hiddenAPISingletonPaths(ctx).index).
Inputs(indexes)

View file

@@ -20,6 +20,9 @@ Merge multiple CSV files, possibly with different columns.
import argparse
import csv
import io
import heapq
import itertools
import operator
from zipfile import ZipFile
@@ -28,6 +31,10 @@ args_parser.add_argument('--header', help='Comma separated field names; '
'if missing determines the header from input files.')
args_parser.add_argument('--zip_input', help='Treat files as ZIP archives containing CSV files to merge.',
action="store_true")
args_parser.add_argument('--key_field', help='The name of the field by which the rows should be sorted. '
'Must be in the field names. '
'Will be the first field in the output. '
'All input files must be sorted by that field.')
args_parser.add_argument('--output', help='Output file for merged CSV.',
default='-', type=argparse.FileType('w'))
args_parser.add_argument('files', nargs=argparse.REMAINDER)
@@ -57,10 +64,29 @@ else:
headers = headers.union(reader.fieldnames)
fieldnames = sorted(headers)
# Concatenate all files to output:
# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)
if len(csv_readers) > 0:
keyField = args.key_field
if keyField:
assert keyField in fieldnames, (
"--key_field {} not found, must be one of {}\n").format(
keyField, ",".join(fieldnames))
# Make the key field the first field in the output
keyFieldIndex = fieldnames.index(args.key_field)
fieldnames.insert(0, fieldnames.pop(keyFieldIndex))
# Create an iterable that performs a lazy merge sort on the csv readers
# sorting the rows by the key field.
all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(keyField))
# Write all rows from the input files to the output:
writer = csv.DictWriter(args.output, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL,
dialect='unix', fieldnames=fieldnames)
writer.writeheader()
for reader in csv_readers:
for row in reader:
writer.writerow(row)
# Read all the rows from the input and write them to the output in the correct
# order:
for row in all_rows:
writer.writerow(row)