Skip to content

Commit dbe10b6

Browse files
committed
plumbing: format/index, Align V2/V3 long name and V4 prefix encoding with Git
The V2/V3 decoder truncated entry names longer than 4095 bytes because it trusted the 12-bit length field. Git falls back to strlen (scanning for the NUL terminator) when the field is saturated at 0xFFF; do the same here so long paths round-trip correctly. The V4 encoder used a path.Dir heuristic for prefix compression, which diverges from Git's byte-level longest-common-prefix algorithm. Replace it with a common-prefix computation so the output matches what upstream produces. Signed-off-by: Paulo Gomes <[email protected]> Assisted-by: Claude Opus 4.6 <[email protected]>
1 parent e9b65df commit dbe10b6

4 files changed

Lines changed: 293 additions & 49 deletions

File tree

plumbing/format/index/decoder.go

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -142,33 +142,55 @@ func (d *Decoder) readEntry(idx *Index) (*Entry, error) {
142142
e.SkipWorktree = extended&skipWorkTreeMask != 0
143143
}
144144

145-
if err := d.readEntryName(idx, e, flags); err != nil {
145+
nameConsumed, err := d.readEntryName(idx, e, flags)
146+
if err != nil {
146147
return nil, err
147148
}
148149

149-
return e, d.padEntry(idx, e, read)
150+
return e, d.padEntry(idx, e, read, nameConsumed)
150151
}
151152

152-
func (d *Decoder) readEntryName(idx *Index, e *Entry, flags uint16) error {
153-
var name string
154-
var err error
155-
153+
// readEntryName reads the entry path and sets e.Name. It returns the
154+
// number of bytes consumed from the stream for the name portion.
155+
func (d *Decoder) readEntryName(idx *Index, e *Entry, flags uint16) (int, error) {
156156
switch idx.Version {
157157
case 2, 3:
158-
len := flags & nameMask
159-
name, err = d.doReadEntryName(len)
158+
nameLen := flags & nameMask
159+
name, consumed, err := d.doReadEntryName(nameLen)
160+
if err != nil {
161+
return 0, err
162+
}
163+
e.Name = name
164+
return consumed, nil
160165
case 4:
161-
name, err = d.doReadEntryNameV4()
166+
name, err := d.doReadEntryNameV4()
167+
if err != nil {
168+
return 0, err
169+
}
170+
e.Name = name
171+
return 0, nil // V4 has no padding; consumed count unused
162172
default:
163-
return ErrUnsupportedVersion
173+
return 0, ErrUnsupportedVersion
164174
}
175+
}
165176

166-
if err != nil {
167-
return err
177+
// doReadEntryName reads the entry path for V2/V3 indexes. It returns the
178+
// name, the number of bytes consumed from the stream, and any error.
179+
// When nameLen equals nameMask (0xFFF), the name was too long to fit in
180+
// the 12-bit field and the real length is found by scanning for the NUL
181+
// terminator — matching C Git's strlen(name) fallback in create_from_disk.
182+
func (d *Decoder) doReadEntryName(nameLen uint16) (string, int, error) {
183+
if nameLen == nameMask {
184+
name, err := binary.ReadUntil(d.r, '\x00')
185+
if err != nil {
186+
return "", 0, err
187+
}
188+
return string(name), len(name) + 1, nil // +1 for the consumed NUL delimiter
168189
}
169190

170-
e.Name = name
171-
return nil
191+
name := make([]byte, nameLen)
192+
_, err := io.ReadFull(d.r, name)
193+
return string(name), int(nameLen), err
172194
}
173195

174196
func (d *Decoder) doReadEntryNameV4() (string, error) {
@@ -197,24 +219,23 @@ func (d *Decoder) doReadEntryNameV4() (string, error) {
197219
return base + string(name), nil
198220
}
199221

200-
func (d *Decoder) doReadEntryName(len uint16) (string, error) {
201-
name := make([]byte, len)
202-
_, err := io.ReadFull(d.r, name)
203-
204-
return string(name), err
205-
}
206-
207-
// Index entries are padded out to the next 8 byte alignment
208-
// for historical reasons related to how C Git read the files.
209-
func (d *Decoder) padEntry(idx *Index, e *Entry, read int) error {
222+
// padEntry discards NUL padding bytes that follow each V2/V3 entry on
223+
// disk. nameConsumed is the number of stream bytes consumed while reading
224+
// the entry name (which may exceed len(e.Name) when a NUL terminator was
225+
// consumed for long names where the 12-bit length field overflowed).
226+
func (d *Decoder) padEntry(idx *Index, e *Entry, read, nameConsumed int) error {
210227
if idx.Version == 4 {
211228
return nil
212229
}
213230

214231
entrySize := read + len(e.Name)
215232
padLen := 8 - entrySize%8
216-
_, err := io.CopyN(io.Discard, d.r, int64(padLen))
217-
return err
233+
padLen -= nameConsumed - len(e.Name)
234+
if padLen > 0 {
235+
_, err := io.CopyN(io.Discard, d.r, int64(padLen))
236+
return err
237+
}
238+
return nil
218239
}
219240

220241
func (d *Decoder) readExtensions(idx *Index) error {

plumbing/format/index/decoder_test.go

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@ package index
33
import (
44
"bytes"
55
"crypto"
6+
"errors"
7+
"fmt"
68
"io"
9+
"os"
10+
"strings"
711
"testing"
812
"time"
913

@@ -429,3 +433,168 @@ func TestDecodeV4StripLength(t *testing.T) {
429433
})
430434
}
431435
}
436+
437+
func TestDecodeNameLength0xFFF(t *testing.T) {
438+
t.Parallel()
439+
440+
tests := []struct {
441+
name string
442+
version uint32
443+
names []string // must be in sorted order
444+
}{
445+
{
446+
name: "V2/SHA1: name 4094 bytes (below 0xFFF, fixed-length read)",
447+
version: 2,
448+
names: []string{strings.Repeat("x", 4094)},
449+
},
450+
{
451+
name: "V2/SHA1: name 4095 bytes (== 0xFFF, triggers NUL scan)",
452+
version: 2,
453+
names: []string{strings.Repeat("x", 4095)},
454+
},
455+
{
456+
name: "V2/SHA1: name 4096 bytes (just above 0xFFF)",
457+
version: 2,
458+
names: []string{strings.Repeat("x", 4096)},
459+
},
460+
{
461+
name: "V2/SHA1: name 5000 bytes (well above 0xFFF)",
462+
version: 2,
463+
names: []string{strings.Repeat("x", 5000)},
464+
},
465+
{
466+
name: "V2/SHA1: long name then short name",
467+
version: 2,
468+
names: []string{strings.Repeat("a", 5000), "zzz"},
469+
},
470+
{
471+
name: "V2/SHA1: short name then long name",
472+
version: 2,
473+
names: []string{"aaa", strings.Repeat("z", 5000)},
474+
},
475+
{
476+
name: "V3/SHA1: name 4095 bytes",
477+
version: 3,
478+
names: []string{strings.Repeat("x", 4095)},
479+
},
480+
}
481+
482+
for _, tc := range tests {
483+
t.Run(tc.name, func(t *testing.T) {
484+
t.Parallel()
485+
486+
entries := make([]*Entry, len(tc.names))
487+
for i, name := range tc.names {
488+
var eh plumbing.Hash
489+
entries[i] = &Entry{
490+
CreatedAt: time.Now(),
491+
ModifiedAt: time.Now(),
492+
Name: name,
493+
Hash: eh,
494+
Size: uint32(i + 1),
495+
}
496+
}
497+
498+
idx := &Index{Version: tc.version, Entries: entries}
499+
500+
buf := bytes.NewBuffer(nil)
501+
err := NewEncoder(buf).Encode(idx)
502+
require.NoError(t, err)
503+
504+
output := &Index{}
505+
err = NewDecoder(bytes.NewReader(buf.Bytes())).Decode(output)
506+
require.NoError(t, err)
507+
508+
require.Len(t, output.Entries, len(tc.names))
509+
for i, name := range tc.names {
510+
assert.Equal(t, name, output.Entries[i].Name,
511+
"entry %d name mismatch", i)
512+
}
513+
})
514+
}
515+
}
516+
517+
// TestDecodeNameLength0xFFFPatchedFlags verifies the decoder's NUL-scan
518+
// fallback by patching a short entry's flags to 0xFFF. The decoder must
519+
// recover the correct name by scanning for the NUL terminator in the
520+
// padding bytes, matching C Git's strlen(name) fallback.
521+
func TestDecodeNameLength0xFFFPatchedFlags(t *testing.T) {
522+
t.Parallel()
523+
524+
var eh plumbing.Hash
525+
idx := &Index{
526+
Version: 2,
527+
Entries: []*Entry{{
528+
CreatedAt: time.Now(),
529+
ModifiedAt: time.Now(),
530+
Name: "hello",
531+
Hash: eh,
532+
Size: 42,
533+
}},
534+
}
535+
536+
buf := bytes.NewBuffer(nil)
537+
err := NewEncoder(buf).Encode(idx)
538+
require.NoError(t, err)
539+
540+
raw := buf.Bytes()
541+
542+
// Flags are at: 12 (header) + 40 (10×uint32) + hashSize.
543+
hashSize := crypto.SHA1.Size()
544+
flagsOff := 12 + 40 + hashSize
545+
origFlags := uint16(raw[flagsOff])<<8 | uint16(raw[flagsOff+1])
546+
require.Equal(t, uint16(len("hello")), origFlags&nameMask,
547+
"original flags should encode the actual name length")
548+
549+
// Overwrite the lower 12 bits to 0xFFF, forcing the NUL-scan path.
550+
raw[flagsOff] = (raw[flagsOff] & 0xF0) | 0x0F
551+
raw[flagsOff+1] = 0xFF
552+
553+
// Recompute the trailing checksum over the modified content.
554+
h := crypto.SHA1.New()
555+
h.Write(raw[:len(raw)-hashSize])
556+
copy(raw[len(raw)-hashSize:], h.Sum(nil))
557+
558+
output := &Index{}
559+
err = NewDecoder(bytes.NewReader(raw)).Decode(output)
560+
require.NoError(t, err)
561+
562+
require.Len(t, output.Entries, 1)
563+
assert.Equal(t, "hello", output.Entries[0].Name)
564+
assert.Equal(t, uint32(42), output.Entries[0].Size)
565+
}
566+
567+
func TestDecodeAllIndexFixtures(t *testing.T) {
568+
t.Parallel()
569+
570+
fix := fixtures.ByTag(".git")
571+
require.NotEmpty(t, fix)
572+
573+
want := map[uint32]struct{}{
574+
2: {},
575+
3: {},
576+
4: {},
577+
}
578+
got := map[uint32]struct{}{}
579+
580+
for i, f := range fix { //nolint: paralleltest // breaks fixtures
581+
t.Run(fmt.Sprint(i), func(t *testing.T) {
582+
f, err := f.DotGit().Open("index")
583+
if errors.Is(err, os.ErrNotExist) {
584+
return
585+
}
586+
587+
require.NoError(t, err)
588+
defer func() { require.NoError(t, f.Close()) }()
589+
590+
idx := &Index{}
591+
d := NewDecoder(f)
592+
err = d.Decode(idx)
593+
require.NoError(t, err)
594+
595+
got[idx.Version] = struct{}{}
596+
})
597+
}
598+
599+
assert.Equal(t, want, got, "not all wanted index versions found")
600+
}

plumbing/format/index/encoder.go

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"io"
8-
"path"
98
"sort"
10-
"strings"
119
"time"
1210

1311
"github.com/go-git/go-git/v5/plumbing/hash"
@@ -160,26 +158,39 @@ func (e *Encoder) encodeEntryName(entry *Entry) error {
160158
}
161159

162160
func (e *Encoder) encodeEntryNameV4(entry *Entry) error {
163-
name := entry.Name
164-
l := 0
161+
// V4 prefix compression: find the longest common prefix between the
162+
// previous entry's name and the current one. The strip length tells
163+
// the decoder how many bytes to remove from the end of the previous
164+
// name, and the suffix is the remainder of the current name.
165+
prefix := 0
165166
if e.lastEntry != nil {
166-
dir := path.Dir(e.lastEntry.Name) + "/"
167-
if strings.HasPrefix(entry.Name, dir) {
168-
l = len(e.lastEntry.Name) - len(dir)
169-
name = strings.TrimPrefix(entry.Name, dir)
170-
} else {
171-
l = len(e.lastEntry.Name)
172-
}
167+
prefix = commonPrefixLen(e.lastEntry.Name, entry.Name)
168+
}
169+
stripLen := 0
170+
if e.lastEntry != nil {
171+
stripLen = len(e.lastEntry.Name) - prefix
173172
}
174173

175174
e.lastEntry = entry
176175

177-
err := binary.WriteVariableWidthInt(e.w, int64(l))
178-
if err != nil {
176+
if err := binary.WriteVariableWidthInt(e.w, int64(stripLen)); err != nil {
179177
return err
180178
}
181179

182-
return binary.Write(e.w, []byte(name+string('\x00')))
180+
suffix := entry.Name[prefix:]
181+
return binary.Write(e.w, append([]byte(suffix), '\x00'))
182+
}
183+
184+
// commonPrefixLen returns the length of the longest common byte prefix
185+
// between a and b.
186+
func commonPrefixLen(a, b string) int {
187+
n := min(len(b), len(a))
188+
for i := range n {
189+
if a[i] != b[i] {
190+
return i
191+
}
192+
}
193+
return n
183194
}
184195

185196
func (e *Encoder) encodeRawExtension(signature string, data []byte) error {

0 commit comments

Comments
 (0)