Skip to content

Commit 878a320

Browse files
committed
Better error recovery in devmapper
Signed-off-by: Maksym Pavlenko <[email protected]>
1 parent eabb536 commit 878a320

4 files changed

Lines changed: 157 additions & 42 deletions

File tree

snapshots/devmapper/device_info.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ const (
5656
Removing
5757
// Removed means that device successfully removed but not yet deleted from meta store
5858
Removed
59+
// Faulty means that the device is errored and the snapshotter failed to rollback it
60+
Faulty
5961
)
6062

6163
func (s DeviceState) String() string {
@@ -84,6 +86,8 @@ func (s DeviceState) String() string {
8486
return "Removing"
8587
case Removed:
8688
return "Removed"
89+
case Faulty:
90+
return "Faulty"
8791
default:
8892
return fmt.Sprintf("unknown %d", s)
8993
}

snapshots/devmapper/metadata.go

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"fmt"
2525
"strconv"
2626

27+
"github.com/hashicorp/go-multierror"
2728
"github.com/pkg/errors"
2829
bolt "go.etcd.io/bbolt"
2930
)
@@ -38,6 +39,7 @@ type deviceIDState byte
3839
const (
3940
deviceFree deviceIDState = iota
4041
deviceTaken
42+
deviceFaulty
4143
)
4244

4345
// Bucket names
@@ -92,11 +94,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error {
9294

9395
// AddDevice saves device info to database.
9496
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
95-
return m.db.Update(func(tx *bolt.Tx) error {
97+
err := m.db.Update(func(tx *bolt.Tx) error {
9698
devicesBucket := tx.Bucket(devicesBucketName)
9799

98-
// Make sure device name is unique
99-
if err := getObject(devicesBucket, info.Name, nil); err == nil {
100+
// Make sure device name is unique. If there is already a device with the same name,
101+
// but in Faulty state, give it a try with another devmapper device ID.
102+
// See https://github.com/containerd/containerd/pull/3436 for more context.
103+
var existing DeviceInfo
104+
if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty {
100105
return ErrAlreadyExists
101106
}
102107

@@ -108,8 +113,37 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
108113

109114
info.DeviceID = deviceID
110115

111-
return putObject(devicesBucket, info.Name, info, false)
116+
return putObject(devicesBucket, info.Name, info, true)
112117
})
118+
119+
if err != nil {
120+
return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName)
121+
}
122+
123+
return nil
124+
}
125+
126+
// MarkFaulty marks the given device and corresponding devmapper device ID as faulty.
127+
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in
128+
// subsequent calls, and in case of success it's status will be changed to 'Created' or 'Activated'.
129+
// The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user.
130+
func (m *PoolMetadata) MarkFaulty(ctx context.Context, info *DeviceInfo) error {
131+
var result error
132+
133+
if err := m.UpdateDevice(ctx, info.Name, func(deviceInfo *DeviceInfo) error {
134+
deviceInfo.State = Faulty
135+
return nil
136+
}); err != nil {
137+
result = multierror.Append(result, err)
138+
}
139+
140+
if err := m.db.Update(func(tx *bolt.Tx) error {
141+
return markDeviceID(tx, info.DeviceID, deviceFaulty)
142+
}); err != nil {
143+
result = multierror.Append(result, err)
144+
}
145+
146+
return result
113147
}
114148

115149
// getNextDeviceID finds the next free device ID by taking a cursor

snapshots/devmapper/metadata_test.go

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,11 @@ import (
2323
"io/ioutil"
2424
"os"
2525
"path/filepath"
26+
"strconv"
2627
"testing"
2728

29+
"github.com/pkg/errors"
30+
"go.etcd.io/bbolt"
2831
"gotest.tools/assert"
2932
is "gotest.tools/assert/cmp"
3033
)
@@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) {
7780
assert.NilError(t, err)
7881

7982
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
80-
assert.Equal(t, ErrAlreadyExists, err)
83+
assert.Equal(t, ErrAlreadyExists, errors.Cause(err))
8184
}
8285

8386
func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
@@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) {
151154
assert.Equal(t, Created, newInfo.State)
152155
}
153156

157+
func TestPoolMetadata_MarkFaulty(t *testing.T) {
158+
tempDir, store := createStore(t)
159+
defer cleanupStore(t, tempDir, store)
160+
161+
info := &DeviceInfo{Name: "test"}
162+
err := store.AddDevice(testCtx, info)
163+
assert.NilError(t, err)
164+
165+
err = store.MarkFaulty(testCtx, info)
166+
assert.NilError(t, err)
167+
168+
saved, err := store.GetDevice(testCtx, info.Name)
169+
assert.NilError(t, err)
170+
assert.Equal(t, saved.State, Faulty)
171+
assert.Assert(t, saved.DeviceID > 0)
172+
173+
// Make sure a device ID marked as faulty as well
174+
err = store.db.View(func(tx *bbolt.Tx) error {
175+
bucket := tx.Bucket(deviceIDBucketName)
176+
key := strconv.FormatUint(uint64(saved.DeviceID), 10)
177+
value := bucket.Get([]byte(key))
178+
assert.Equal(t, value[0], byte(deviceFaulty))
179+
return nil
180+
})
181+
assert.NilError(t, err)
182+
}
183+
154184
func TestPoolMetadata_GetDeviceNames(t *testing.T) {
155185
tempDir, store := createStore(t)
156186
defer cleanupStore(t, tempDir, store)

snapshots/devmapper/pool_device.go

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi
118118
State: Unknown,
119119
}
120120

121-
// Save initial device metadata and allocate new device ID from store
122-
if err := p.metadata.AddDevice(ctx, info); err != nil {
123-
return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName)
124-
}
121+
var (
122+
metaErr error
123+
devErr error
124+
activeErr error
125+
)
125126

126127
defer func() {
127-
if retErr == nil {
128+
// We've created a devmapper device, but failed to activate it, try rollback everything
129+
if activeErr != nil {
130+
// Delete the device first.
131+
delErr := p.deleteDevice(ctx, info)
132+
if delErr != nil {
133+
// Failed to rollback, mark the device as faulty and keep metadata in order to
134+
// preserve the faulty device ID
135+
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info))
136+
return
137+
}
138+
139+
// The devmapper device has been successfully deleted, deallocate device ID
140+
if err := p.RemoveDevice(ctx, info.Name); err != nil {
141+
retErr = multierror.Append(retErr, err)
142+
return
143+
}
144+
128145
return
129146
}
130147

131-
// Rollback metadata
132-
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name))
148+
// We're unable to create the devmapper device, most likely something wrong with the deviceID
149+
if devErr != nil {
150+
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info))
151+
return
152+
}
133153
}()
134154

135-
// Create thin device
136-
if err := p.createDevice(ctx, info); err != nil {
137-
return err
155+
// Save initial device metadata and allocate new device ID from store
156+
metaErr = p.metadata.AddDevice(ctx, info)
157+
if metaErr != nil {
158+
return metaErr
138159
}
139160

140-
defer func() {
141-
if retErr == nil {
142-
return
143-
}
161+
// Create thin device
162+
devErr = p.createDevice(ctx, info)
163+
if devErr != nil {
164+
return devErr
165+
}
144166

145-
// Rollback creation
146-
retErr = multierror.Append(retErr, p.deleteDevice(ctx, info))
147-
}()
167+
// Activate thin device
168+
activeErr = p.activateDevice(ctx, info)
169+
if activeErr != nil {
170+
return activeErr
171+
}
148172

149-
return p.activateDevice(ctx, info)
173+
return nil
150174
}
151175

152176
// createDevice creates thin device
@@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
185209
State: Unknown,
186210
}
187211

188-
// Save snapshot metadata and allocate new device ID
189-
if err := p.metadata.AddDevice(ctx, snapInfo); err != nil {
190-
return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName)
191-
}
212+
var (
213+
metaErr error
214+
devErr error
215+
activeErr error
216+
)
192217

193218
defer func() {
194-
if retErr == nil {
219+
// We've created a devmapper device, but failed to activate it, try rollback everything
220+
if activeErr != nil {
221+
// Delete the device first.
222+
delErr := p.deleteDevice(ctx, snapInfo)
223+
if delErr != nil {
224+
// Failed to rollback, mark the device as faulty and keep metadata in order to
225+
// preserve the faulty device ID
226+
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo))
227+
return
228+
}
229+
230+
// The devmapper device has been successfully deleted, deallocate device ID
231+
if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil {
232+
retErr = multierror.Append(retErr, err)
233+
return
234+
}
235+
195236
return
196237
}
197238

198-
// Rollback metadata
199-
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name))
239+
// We're unable to create the devmapper device, most likely something wrong with the deviceID
240+
if devErr != nil {
241+
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo))
242+
return
243+
}
200244
}()
201245

202-
// Create thin device snapshot
203-
if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil {
204-
return err
246+
// Save snapshot metadata and allocate new device ID
247+
metaErr = p.metadata.AddDevice(ctx, snapInfo)
248+
if metaErr != nil {
249+
return metaErr
205250
}
206251

207-
defer func() {
208-
if retErr == nil {
209-
return
210-
}
252+
// Create thin device snapshot
253+
devErr = p.createSnapshot(ctx, baseInfo, snapInfo)
254+
if devErr != nil {
255+
return devErr
256+
}
211257

212-
// Rollback snapshot creation
213-
retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo))
214-
}()
258+
// Activate the snapshot device
259+
activeErr = p.activateDevice(ctx, snapInfo)
260+
if activeErr != nil {
261+
return activeErr
262+
}
215263

216-
// Activate snapshot device
217-
return p.activateDevice(ctx, snapInfo)
264+
return nil
218265
}
219266

220267
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {

0 commit comments

Comments
 (0)