Skip to content

Commit 7f69572

Browse files
ross-spencer authored and richardlehane committed
Squashed commit of the following:
commit 64bf4da Author: Richard Lehane <[email protected]> Date: Mon Mar 20 15:12:56 2023 +0000 miscellaneous edit to prompt a merge check commit 706209d Merge: dcb15c2 eb6f061 Author: Richard Lehane <[email protected]> Date: Mon Mar 20 12:42:20 2023 +0100 Merge branch 'develop' into dev/add-pronom-type commit dcb15c2 Author: Richard Lehane <[email protected]> Date: Mon Mar 20 08:12:50 2023 +0100 fix indexes used by droid writer commit c95e02d Author: Richard Lehane <[email protected]> Date: Sun Mar 19 22:58:45 2023 +0100 add "noclass" flag to allow omitting format class commit b958528 Author: Richard Lehane <[email protected]> Date: Sun Mar 19 13:22:00 2023 +0100 use Limit commit 957c2e7 Author: Ross Spencer <[email protected]> Date: Sun Feb 5 21:07:14 2023 +0100 Add test for DROID CSV header output Ensures that the DROID header doesn't change in code unless it is explicitly made to do so. commit 9f94a77 Author: Ross Spencer <[email protected]> Date: Wed Jan 4 16:50:29 2023 +0100 Create in-memory filesystem for PRONOM skeletons We can avoid writing to disk and make the tests here more portable by reading from an in-memory filesystem. The skeletons themselves are small and so can be easily stored in-line as strings and then turned into byte objects. Given the refactor to in-memory objects, we also take the opportunity to add a file that won't identify with the minimal PRONOM signature file and PRONOM reports. Type should be a nil-string as with many of the other fields. commit e27bb70 Author: Ross Spencer <[email protected]> Date: Wed Dec 28 12:47:55 2022 +0100 Linting fixes PRONOM identifier related linting fixes for the different source files touched by the PRONOM types additions. commit 2bdc899 Author: Ross Spencer <[email protected]> Date: Tue Dec 27 18:33:22 2022 +0100 Add tests for PRONOM types work Tests are added for the PRONOM types work along with new helper functions for making Siegfried tests more discrete and maintainable. 
commit 0b02110 Author: Ross Spencer <[email protected]> Date: Tue Dec 27 17:45:28 2022 +0100 Add format type to Siegfried PRONOM output
1 parent ad0006c commit 7f69572

15 files changed

+520
-68
lines changed

cmd/roy/roy.go

+4
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ var (
146146
noxml = build.Bool("noxml", false, "skip XML matcher")
147147
noriff = build.Bool("noriff", false, "skip RIFF matcher")
148148
noreports = build.Bool("noreports", false, "build directly from DROID file rather than PRONOM reports")
149+
noclass = build.Bool("noclass", false, "omit format classes from the signature file")
149150
doubleup = build.Bool("doubleup", false, "include byte signatures for formats that also have container signatures")
150151
rng = build.Int("range", config.Range(), "define a maximum range for segmentation")
151152
distance = build.Int("distance", config.Distance(), "define a maximum distance for segmentation")
@@ -416,6 +417,9 @@ the DROID signature file you should also include a regular signature extension
416417
if *noreports {
417418
opts = append(opts, config.SetNoReports())
418419
}
420+
if *noclass {
421+
opts = append(opts, config.SetNoClass())
422+
}
419423
if *doubleup {
420424
opts = append(opts, config.SetDoubleUp())
421425
}

cmd/sf/longpath.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
3434
}
3535
if err != nil {
3636
if coerr {
37-
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
37+
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
3838
return nil
3939
}
40-
return WalkError{path, err}
40+
return walkError{path, err}
4141
}
4242
if info.IsDir() {
4343
if norecurse && path != root {
@@ -50,7 +50,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
5050
}
5151
// zero user read permissions mask, octal 400 (decimal 256)
5252
if !info.Mode().IsRegular() || info.Mode()&256 == 0 {
53-
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
53+
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
5454
return nil
5555
}
5656
identifyFile(gf(path, "", info.ModTime(), info.Size()), ctxts, gf)

cmd/sf/longpath_windows.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
8686
info, err = retryStat(path, err) // retry stat in case is a windows long path error
8787
if err != nil {
8888
if coerr {
89-
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
89+
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
9090
return nil
9191
}
92-
return WalkError{path, err}
92+
return walkError{path, err}
9393
}
9494
lp, sp = longpath(path), path
9595
retry = true
@@ -107,7 +107,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
107107
return nil
108108
}
109109
if !info.Mode().IsRegular() {
110-
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
110+
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
111111
return nil
112112
}
113113
identifyFile(gf(shortpath(path, orig), "", info.ModTime(), info.Size()), ctxts, gf)

cmd/sf/pronom_test.go

+186
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
package main
2+
3+
import (
4+
"encoding/hex"
5+
"path/filepath"
6+
"reflect"
7+
"sort"
8+
"testing"
9+
"testing/fstest"
10+
11+
"github.com/richardlehane/siegfried"
12+
"github.com/richardlehane/siegfried/pkg/config"
13+
"github.com/richardlehane/siegfried/pkg/pronom"
14+
)
15+
16+
// DataPath is the relative path to roy's data directory. TestPronom passes it
// to config.SetHome before building the PRONOM identifier.
var DataPath = filepath.Join("..", "..", "cmd", "roy", "data")

// pronomIdentificationTests provides our structure for table driven tests.
// The fields mirror, in order, the eight values of a PRONOM identification
// result, so rows can be written and compared positionally.
type pronomIdentificationTests struct {
	identifier string // name of the identifier that produced the result
	puid       string // PRONOM unique identifier, or "UNKNOWN"
	label      string // format name
	version    string // format version
	mime       string // MIME type
	types      string // PRONOM format type(s)
	details    string // basis for the match
	error      string // warning or error text, e.g. "no match"
}
29+
30+
// skeletons holds the in-memory skeleton files, keyed by file name, that
// makeSkeletons populates.
var skeletons = make(map[string]*fstest.MapFile)

// minimalPronom lists the PUIDs the test identifier is limited to.
var minimalPronom = []string{"fmt/1", "fmt/3", "fmt/5", "fmt/11", "fmt/14"}

// makeSkeletons populates the global skeletons map from hex-encoded
// byte-sequences to save having to store skeletons on disk and read from
// them. It panics if a fixture is not valid hex, so a malformed skeleton is
// reported immediately instead of being silently truncated.
func makeSkeletons() {
	files := map[string]string{
		"fmt-11-signature-id-58.png":  "89504e470d0a1a0a0000000d494844520000000049454e44ae426082",
		"fmt-14-signature-id-123.pdf": "255044462d312e302525454f46",
		"fmt-1-signature-id-1032.wav": "" +
			"524946460000000057415645000000000000000000000000000000000000" +
			"000062657874000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"00000000000000000000000000000000000000000000000000000000",
		"fmt-5-signature-id-51.avi": "" +
			"524946460000000041564920000000000000000000000000000000000000" +
			"00004c495354000000006864726c61766968000000000000000000000000" +
			"00000000000000004c495354000000006d6f7669",
		"fmt-3-signature-id-18.gif": "4749463837613b",
		// Hex input needs an even number of digits. These three bytes
		// (ba df 00) match no signature in the minimal PRONOM set.
		"badf00d.unknown": "badf00",
	}
	for name, encoded := range files {
		data, err := hex.DecodeString(encoded)
		if err != nil {
			panic("makeSkeletons: invalid hex for " + name + ": " + err.Error())
		}
		skeletons[name] = &fstest.MapFile{Data: data}
	}
}
67+
68+
var pronomIDs = []pronomIdentificationTests{
69+
{
70+
"pronom",
71+
"UNKNOWN",
72+
"",
73+
"",
74+
"",
75+
"",
76+
"",
77+
"no match",
78+
},
79+
{
80+
"pronom",
81+
"fmt/1",
82+
"Broadcast WAVE",
83+
"0 Generic",
84+
"audio/x-wav",
85+
"Audio",
86+
"extension match wav; byte match at [[0 12] [32 356]]",
87+
"",
88+
},
89+
{
90+
"pronom",
91+
"fmt/11",
92+
"Portable Network Graphics",
93+
"1.0",
94+
"image/png",
95+
"Image (Raster)",
96+
"extension match png; byte match at [[0 16] [16 12]]",
97+
"",
98+
},
99+
{
100+
"pronom",
101+
"fmt/14",
102+
"Acrobat PDF 1.0 - Portable Document Format",
103+
"1.0",
104+
"application/pdf",
105+
"Page Description",
106+
"extension match pdf; byte match at [[0 8] [8 5]]",
107+
"",
108+
},
109+
{
110+
"pronom",
111+
"fmt/3",
112+
"Graphics Interchange Format",
113+
"87a",
114+
"image/gif",
115+
"Image (Raster)",
116+
"extension match gif; byte match at [[0 6] [6 1]]",
117+
"",
118+
},
119+
{
120+
"pronom",
121+
"fmt/5",
122+
"Audio/Video Interleaved Format",
123+
"",
124+
"video/x-msvideo",
125+
"Audio, Video",
126+
"extension match avi; byte match at [[0 12] [32 16] [68 12]]",
127+
"",
128+
},
129+
}
130+
131+
// TestPronom looks to see if PRONOM identification results for a
132+
// minimized PRONOM dataset are correct and contain the information we
133+
// anticipate.
134+
func TestPronom(t *testing.T) {
135+
sf := siegfried.New()
136+
config.SetHome(DataPath)
137+
identifier, err := pronom.New(config.SetLimit(minimalPronom))
138+
if err != nil {
139+
t.Errorf("Error creating new PRONOM identifier: %s", err)
140+
}
141+
sf.Add(identifier)
142+
makeSkeletons()
143+
skeletonFS := fstest.MapFS(skeletons)
144+
testDirListing, err := skeletonFS.ReadDir(".")
145+
if err != nil {
146+
t.Fatalf("Error reading test files directory: %s", err)
147+
}
148+
const resultLen int = 8
149+
results := make([]pronomIdentificationTests, 0)
150+
for _, val := range testDirListing {
151+
testFilePath := filepath.Join(".", val.Name())
152+
reader, _ := skeletonFS.Open(val.Name())
153+
res, _ := sf.Identify(reader, testFilePath, "")
154+
result := res[0].Values()
155+
if len(result) != resultLen {
156+
t.Errorf("Result len: %d not %d", len(result), resultLen)
157+
}
158+
idResult := pronomIdentificationTests{
159+
result[0], // identifier
160+
result[1], // PUID
161+
result[2], // label
162+
result[3], // version
163+
result[4], // mime
164+
result[5], // types
165+
result[6], // details
166+
result[7], // error
167+
}
168+
results = append(results, idResult)
169+
}
170+
// Sort expected results and received results to make them
171+
// comparable.
172+
sort.Slice(pronomIDs, func(i, j int) bool {
173+
return pronomIDs[i].puid < pronomIDs[j].puid
174+
})
175+
sort.Slice(results, func(i, j int) bool {
176+
return results[i].puid < results[j].puid
177+
})
178+
// Compare results on a result by result basis.
179+
for idx, res := range results {
180+
//t.Error(res)
181+
if !reflect.DeepEqual(res, pronomIDs[idx]) {
182+
t.Errorf("Results not equal for %s; expected %v; got %v", res.puid, pronomIDs[idx], res)
183+
}
184+
}
185+
config.Clear()()
186+
}

cmd/sf/serve.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func handleIdentify(w http.ResponseWriter, r *http.Request, s *siegfried.Siegfri
221221
err = identify(ctxts, path, "", coerr, nrec, d, gf)
222222
wg.Wait()
223223
wr.Tail()
224-
if _, ok := err.(WalkError); ok { // only dump out walk errors, other errors reported in result
224+
if _, ok := err.(walkError); ok { // only dump out walk errors, other errors reported in result
225225
io.WriteString(w, err.Error())
226226
}
227227
}

cmd/sf/sf.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,9 @@ var (
7474
ctxPool *sync.Pool
7575
)
7676

77-
type ModeError os.FileMode
77+
type modeError os.FileMode
7878

79-
func (me ModeError) Error() string {
79+
func (me modeError) Error() string {
8080
typ := "unknown"
8181
switch {
8282
case os.FileMode(me)&os.ModeDir == os.ModeDir:
@@ -95,12 +95,12 @@ func (me ModeError) Error() string {
9595
return fmt.Sprintf("file is of type %s; only regular files can be scanned", typ)
9696
}
9797

98-
type WalkError struct {
98+
type walkError struct {
9999
path string
100100
err error
101101
}
102102

103-
func (we WalkError) Error() string {
103+
func (we walkError) Error() string {
104104
return fmt.Sprintf("[FATAL] file access error for %s: %v", we.path, we.err)
105105
}
106106

@@ -432,7 +432,7 @@ func main() {
432432
case *jsono:
433433
w = writer.JSON(os.Stdout)
434434
case *droido:
435-
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) != 7) {
435+
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) < 7) {
436436
close(ctxts)
437437
log.Fatalln("[FATAL] DROID output is limited to signature files with a single PRONOM identifier")
438438
}

pkg/config/identifier.go

+2
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,8 @@ func Clear() func() private {
318318
return func() private {
319319
identifier.name = ""
320320
identifier.extend = nil
321+
identifier.limit = nil
322+
identifier.exclude = nil
321323
loc.fdd = ""
322324
mimeinfo.mi = ""
323325
return private{}

pkg/config/pronom.go

+19-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ var pronom = struct {
2929
droid string // name of droid file e.g. DROID_SignatureFile_V78.xml
3030
container string // e.g. container-signature-19770502.xml
3131
reports string // directory where PRONOM reports are stored
32+
noclass bool // omit class from the format info
3233
doubleup bool // include byte signatures for formats that also have container signatures
3334
extendc []string //container extensions
3435
changesURL string
@@ -63,7 +64,7 @@ var pronom = struct {
6364

6465
// GETTERS
6566

66-
// DROID returns the location of the DROID signature file.
67+
// Droid returns the location of the DROID signature file.
6768
// If not set, infers the latest file.
6869
func Droid() string {
6970
if pronom.droid == "" {
@@ -79,7 +80,7 @@ func Droid() string {
7980
return pronom.droid
8081
}
8182

82-
// DROID base returns the base filename of the DROID signature file.
83+
// DroidBase returns the base filename of the DROID signature file.
8384
// If not set, infers the latest file.
8485
func DroidBase() string {
8586
if pronom.droid == "" {
@@ -163,6 +164,11 @@ func Reports() string {
163164
return filepath.Join(siegfried.home, pronom.reports)
164165
}
165166

167+
// NoClass reports whether the noclass flag has been set. When set, the format class is omitted from the format info.
168+
func NoClass() bool {
169+
return pronom.noclass
170+
}
171+
166172
// DoubleUp reports whether the doubleup flag has been set. This will cause byte signatures to be built for formats where container signatures are also provided.
167173
func DoubleUp() bool {
168174
return pronom.doubleup
@@ -173,11 +179,12 @@ func ExcludeDoubles(puids, cont []string) []string {
173179
return exclude(puids, cont)
174180
}
175181

176-
// Extend reports whether a set of container signature extensions has been provided.
182+
// ExtendC reports whether a set of container signature extensions has been provided.
177183
func ExtendC() []string {
178184
return extensionPaths(pronom.extendc)
179185
}
180186

187+
// ChangesURL returns the URL for the PRONOM release notes.
181188
func ChangesURL() string {
182189
return pronom.changesURL
183190
}
@@ -225,6 +232,14 @@ func SetNoReports() func() private {
225232
}
226233
}
227234

235+
// SetNoClass returns an option that sets the noclass flag, causing the format class to be omitted from the format info.
236+
func SetNoClass() func() private {
237+
return func() private {
238+
pronom.noclass = true
239+
return private{}
240+
}
241+
}
242+
228243
// SetDoubleUp causes byte signatures to be built for formats where container signatures are also provided.
229244
func SetDoubleUp() func() private {
230245
return func() private {
@@ -248,6 +263,7 @@ func SetHarvestTimeout(d time.Duration) {
248263
pronom.harvestTimeout = d
249264
}
250265

266+
// SetHarvestThrottle sets a throttle value for downloading DROID reports.
251267
func SetHarvestThrottle(d time.Duration) {
252268
pronom.harvestThrottle = d
253269
}

pkg/mimeinfo/identifier.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ func (r *Recorder) Satisfied(mt core.MatcherType) (bool, core.Hint) {
190190
sort.Sort(r.ids)
191191
if len(r.ids) > 0 && (r.ids[0].xmlMatch || (r.ids[0].magicScore > 0 && r.ids[0].ID != config.TextMIME())) {
192192
if mt == core.ByteMatcher {
193-
return true, core.Hint{r.Start(mt), nil}
193+
return true, core.Hint{Exclude: r.Start(mt), Pivot: nil}
194194
}
195195
return true, core.Hint{}
196196
}

0 commit comments

Comments
 (0)