Skip to content

Commit

Permalink
Add API to regenerate index from CARv1 or CARv2
Browse files Browse the repository at this point in the history
The index generation APIs either allowed reading an existing index from
a CARv2 or explicitly required a CARv1 to generate an index.

Introduce APIs to make it easier for users who want to regenerate the
index regardless of whether it already exists in a CAR file. The index
generation APIs are changed to accept either format and to
re-generate the index from the data payload unless `ReadOrGenerateIndex`
is called.

Adjust the tests to run for all flavours of index generation with both
CARv1 and CARv2 payloads.
  • Loading branch information
masih committed Jun 30, 2022
1 parent c65f0bf commit 7ba9372
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 88 deletions.
98 changes: 81 additions & 17 deletions v2/index_gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ import (
"github.com/multiformats/go-varint"
)

// GenerateIndex generates index for a given car in v1 format.
// GenerateIndex generates index for the given car payload reader.
// The index can be stored in serialized format using index.WriteTo.
// See LoadIndex.
//
// Note, the index is re-generated every time even if the payload is in CARv2 format and already has
// an index. To read existing index when available see ReadOrGenerateIndex.
// See: LoadIndex.
func GenerateIndex(v1r io.Reader, opts ...Option) (index.Index, error) {
wopts := ApplyOptions(opts...)
idx, err := index.New(wopts.IndexCodec)
Expand All @@ -28,21 +31,63 @@ func GenerateIndex(v1r io.Reader, opts ...Option) (index.Index, error) {
return idx, nil
}

// LoadIndex populates idx with index records generated from v1r.
// The v1r must be data payload in CARv1 format.
func LoadIndex(idx index.Index, v1r io.Reader, opts ...Option) error {
reader := internalio.ToByteReadSeeker(v1r)
header, err := carv1.ReadHeader(reader)
// LoadIndex populates idx with index records generated from r.
// The r may be in CARv1 or CARv2 format.
//
// Note, the index is re-generated every time even if r is in CARv2 format and already has an index.
// To read existing index when available see ReadOrGenerateIndex.
func LoadIndex(idx index.Index, r io.Reader, opts ...Option) error {
reader := internalio.ToByteReadSeeker(r)
pragma, err := carv1.ReadHeader(r)
if err != nil {
return fmt.Errorf("error reading car header: %w", err)
}

if header.Version != 1 {
return fmt.Errorf("expected version to be 1, got %v", header.Version)
}
var dataSize, dataOffset int64
switch pragma.Version {
case 1:
break
case 2:
// Read V2 header which should appear immediately after pragma according to CARv2 spec.
var v2h Header
_, err := v2h.ReadFrom(r)
if err != nil {
return err
}

// Parse Options.
o := ApplyOptions(opts...)
// Sanity-check the CARv2 header
if v2h.DataOffset < HeaderSize {
return fmt.Errorf("malformed CARv2; data offset too small: %d", v2h.DataOffset)
}
if v2h.DataSize < 1 {
return fmt.Errorf("malformed CARv2; data payload size too small: %d", v2h.DataSize)
}

// Seek to the beginning of the inner CARv1 payload
_, err = reader.Seek(int64(v2h.DataOffset), io.SeekStart)
if err != nil {
return err
}

// Set dataSize and dataOffset which are then used during index loading logic to decide
// where to stop and adjust section offset respectively.
// Note that we could use a LimitReader here and re-define reader with it. However, it means
// the internalio.ToByteReadSeeker will be less efficient since LimitReader implements
// neither ByteReader nor ReadSeeker.
dataSize = int64(v2h.DataSize)
dataOffset = int64(v2h.DataOffset)

// Read the inner CARv1 header to skip it and sanity check it.
v1h, err := carv1.ReadHeader(reader)
if err != nil {
return err
}
if v1h.Version != 1 {
return fmt.Errorf("expected data payload header version of 1; got %d", v1h.Version)
}
default:
return fmt.Errorf("expected either version 1 or 2; got %d", pragma.Version)
}

// Record the start of each section, with first section starting from current position in the
// reader, i.e. right after the header, since we have only read the header so far.
Expand All @@ -55,6 +100,13 @@ func LoadIndex(idx index.Index, v1r io.Reader, opts ...Option) error {
return err
}

// Subtract the data offset; if CARv1 this would be zero otherwise the value will come from the
// CARv2 header.
sectionOffset -= dataOffset

// Parse Options.
o := ApplyOptions(opts...)

records := make([]index.Record, 0)
for {
// Read the section's length.
Expand Down Expand Up @@ -94,6 +146,14 @@ func LoadIndex(idx index.Index, v1r io.Reader, opts ...Option) error {
if sectionOffset, err = reader.Seek(remainingSectionLen, io.SeekCurrent); err != nil {
return err
}
// Subtract the data offset which will be non-zero when reader represents a CARv2.
sectionOffset -= dataOffset

// Check if we have reached the end of data payload and if so treat it as an EOF.
// Note, dataSize will be non-zero only if we are reading from a CARv2.
if dataSize != 0 && sectionOffset >= dataSize {
break
}
}

if err := idx.Load(records); err != nil {
Expand All @@ -103,16 +163,20 @@ func LoadIndex(idx index.Index, v1r io.Reader, opts ...Option) error {
return nil
}

// GenerateIndexFromFile walks a car v1 file at the given path and generates an index of cid->byte offset.
// The index can be stored using index.WriteTo.
// See GenerateIndex.
func GenerateIndexFromFile(path string) (index.Index, error) {
// GenerateIndexFromFile walks a CAR file at the given path and generates an index of cid->byte offset.
// The index can be stored using index.WriteTo. Both CARv1 and CARv2 formats are accepted.
//
// Note, the index is re-generated every time even if the given CAR file is in CARv2 format and
// already has an index. To read existing index when available see ReadOrGenerateIndex.
//
// See: GenerateIndex.
func GenerateIndexFromFile(path string, opts ...Option) (index.Index, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
return GenerateIndex(f)
return GenerateIndex(f, opts...)
}

// ReadOrGenerateIndex accepts both CARv1 and CARv2 formats, and reads or generates an index for it.
Expand Down
134 changes: 63 additions & 71 deletions v2/index_gen_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
"github.com/stretchr/testify/require"
)

func TestReadOrGenerateIndex(t *testing.T) {
func TestGenerateIndex(t *testing.T) {
tests := []struct {
name string
carPath string
Expand All @@ -26,24 +26,21 @@ func TestReadOrGenerateIndex(t *testing.T) {
wantErr bool
}{
{
"CarV1IsIndexedAsExpected",
"testdata/sample-v1.car",
[]carv2.Option{},
func(t *testing.T) index.Index {
name: "CarV1IsIndexedAsExpected",
carPath: "testdata/sample-v1.car",
wantIndexer: func(t *testing.T) index.Index {
v1, err := os.Open("testdata/sample-v1.car")
require.NoError(t, err)
defer v1.Close()
want, err := carv2.GenerateIndex(v1)
require.NoError(t, err)
return want
},
false,
},
{
"CarV2WithIndexIsReturnedAsExpected",
"testdata/sample-wrapped-v2.car",
[]carv2.Option{},
func(t *testing.T) index.Index {
name: "CarV2WithIndexIsReturnedAsExpected",
carPath: "testdata/sample-wrapped-v2.car",
wantIndexer: func(t *testing.T) index.Index {
v2, err := os.Open("testdata/sample-wrapped-v2.car")
require.NoError(t, err)
defer v2.Close()
Expand All @@ -53,53 +50,47 @@ func TestReadOrGenerateIndex(t *testing.T) {
require.NoError(t, err)
return want
},
false,
},
{
"CarV1WithZeroLenSectionIsGeneratedAsExpected",
"testdata/sample-v1-with-zero-len-section.car",
[]carv2.Option{carv2.ZeroLengthSectionAsEOF(true)},
func(t *testing.T) index.Index {
name: "CarV1WithZeroLenSectionIsGeneratedAsExpected",
carPath: "testdata/sample-v1-with-zero-len-section.car",
opts: []carv2.Option{carv2.ZeroLengthSectionAsEOF(true)},
wantIndexer: func(t *testing.T) index.Index {
v1, err := os.Open("testdata/sample-v1-with-zero-len-section.car")
require.NoError(t, err)
defer v1.Close()
want, err := carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true))
require.NoError(t, err)
return want
},
false,
},
{
"AnotherCarV1WithZeroLenSectionIsGeneratedAsExpected",
"testdata/sample-v1-with-zero-len-section2.car",
[]carv2.Option{carv2.ZeroLengthSectionAsEOF(true)},
func(t *testing.T) index.Index {
name: "AnotherCarV1WithZeroLenSectionIsGeneratedAsExpected",
carPath: "testdata/sample-v1-with-zero-len-section2.car",
opts: []carv2.Option{carv2.ZeroLengthSectionAsEOF(true)},
wantIndexer: func(t *testing.T) index.Index {
v1, err := os.Open("testdata/sample-v1-with-zero-len-section2.car")
require.NoError(t, err)
defer v1.Close()
want, err := carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true))
require.NoError(t, err)
return want
},
false,
},
{
"CarV1WithZeroLenSectionWithoutOptionIsError",
"testdata/sample-v1-with-zero-len-section.car",
[]carv2.Option{},
func(t *testing.T) index.Index { return nil },
true,
name: "CarV1WithZeroLenSectionWithoutOptionIsError",
carPath: "testdata/sample-v1-with-zero-len-section.car",
wantErr: true,
},
{
"CarOtherThanV1OrV2IsError",
"testdata/sample-rootless-v42.car",
[]carv2.Option{},
func(t *testing.T) index.Index { return nil },
true,
name: "CarOtherThanV1OrV2IsError",
carPath: "testdata/sample-rootless-v42.car",
wantIndexer: func(t *testing.T) index.Index { return nil },
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Run("ReadOrGenerateIndex_"+tt.name, func(t *testing.T) {
carFile, err := os.Open(tt.carPath)
require.NoError(t, err)
t.Cleanup(func() { assert.NoError(t, carFile.Close()) })
Expand All @@ -108,54 +99,55 @@ func TestReadOrGenerateIndex(t *testing.T) {
require.Error(t, err)
} else {
require.NoError(t, err)
want := tt.wantIndexer(t)
var want index.Index
if tt.wantIndexer != nil {
want = tt.wantIndexer(t)
}
require.Equal(t, want, got)
}
})
}
}

func TestGenerateIndexFromFile(t *testing.T) {
tests := []struct {
name string
carPath string
wantIndexer func(t *testing.T) index.Index
wantErr bool
}{
{
"CarV1IsIndexedAsExpected",
"testdata/sample-v1.car",
func(t *testing.T) index.Index {
v1, err := os.Open("testdata/sample-v1.car")
t.Run("GenerateIndexFromFile_"+tt.name, func(t *testing.T) {
got, err := carv2.GenerateIndexFromFile(tt.carPath, tt.opts...)
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
defer v1.Close()
want, err := carv2.GenerateIndex(v1)
var want index.Index
if tt.wantIndexer != nil {
want = tt.wantIndexer(t)
}
require.Equal(t, want, got)
}
})
t.Run("LoadIndex_"+tt.name, func(t *testing.T) {
carFile, err := os.Open(tt.carPath)
require.NoError(t, err)
got, err := index.New(multicodec.CarMultihashIndexSorted)
require.NoError(t, err)
err = carv2.LoadIndex(got, carFile, tt.opts...)
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
return want
},
false,
},
{
"CarV2IsErrorSinceOnlyV1PayloadIsExpected",
"testdata/sample-wrapped-v2.car",
func(t *testing.T) index.Index { return nil },
true,
},
{
"CarOtherThanV1OrV2IsError",
"testdata/sample-rootless-v42.car",
func(t *testing.T) index.Index { return nil },
true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := carv2.GenerateIndexFromFile(tt.carPath)
var want index.Index
if tt.wantIndexer != nil {
want = tt.wantIndexer(t)
}
require.Equal(t, want, got)
}
})
t.Run("GenerateIndex_"+tt.name, func(t *testing.T) {
carFile, err := os.Open(tt.carPath)
require.NoError(t, err)
got, err := carv2.GenerateIndex(carFile, tt.opts...)
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
want := tt.wantIndexer(t)
var want index.Index
if tt.wantIndexer != nil {
want = tt.wantIndexer(t)
}
require.Equal(t, want, got)
}
})
Expand Down

0 comments on commit 7ba9372

Please sign in to comment.