Skip to content

Commit 01f9c38

Browse files
Make media downloads domain-specific (#247)
* Make media downloads are domain-specific * Ignore reduxer/sites * Update CHANGELOG.md * Remove init.go * Unset specific env to avoid httptest server hangs * Fix golangci linter * Add more tests
1 parent c01dfdf commit 01f9c38

9 files changed

+239
-4
lines changed

.licenserc.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ header:
2929
- '**/go.mod'
3030
- '**/go.sum'
3131
- 'LICENSE'
32+
- 'reduxer/sites'
3233
- 'template/assets/**'
3334
- 'template/views/*.html'
3435
- 'wayback.1'

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1212
- Replace set-output with recommended env output ([#234](https://github.com/wabarc/wayback/pull/234))
1313
- Create deployment instructions for Render ([#236](https://github.com/wabarc/wayback/pull/236))
1414
- Specify dependencies for the distribution package ([#243](https://github.com/wabarc/wayback/pull/243))
15+
- Make media downloads domain-specific ([#247](https://github.com/wabarc/wayback/pull/247))
1516

1617
## [0.18.1] - 2022-10-30
1718

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ You can also specify configuration options either via command flags or via envir
218218
| - | `WAYBACK_BOLT_PATH` | `./wayback.db` | File path of bolt database |
219219
| - | `WAYBACK_STORAGE_DIR` | - | Directory to store binary file, e.g. PDF, html file |
220220
| - | `WAYBACK_MAX_MEDIA_SIZE` | `512MB` | Max size to limit download stream media |
221+
| - | `WAYBACK_MEDIA_SITES` | - | Additional media websites to support, separated by commas |
221222
| - | `WAYBACK_TIMEOUT` | `300` | Timeout for single wayback request, defaults to 300 second |
222223
| - | `WAYBACK_MAX_RETRIES` | `2` | Max retries for single wayback request, defaults to 2 |
223224
| - | `WAYBACK_USERAGENT` | `WaybackArchiver/1.0` | User-Agent for a wayback request |

reduxer/init.go reduxer/media.go

+74-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,20 @@
1-
// Copyright 2021 Wayback Archiver. All rights reserved.
1+
// Copyright 2022 Wayback Archiver. All rights reserved.
22
// Use of this source code is governed by the GNU GPL v3
33
// license that can be found in the LICENSE file.
44

55
package reduxer // import "github.com/wabarc/wayback/reduxer"
66

7-
// Copied from https://github.com/iawia002/lux/blob/f1baf46e/app/register.go#L3-L40
87
import (
8+
"bufio"
9+
"embed"
10+
"net/url"
11+
"os"
12+
"strings"
13+
14+
"github.com/wabarc/logger"
15+
"golang.org/x/net/publicsuffix"
16+
17+
// Copied from https://github.com/iawia002/lux/blob/f1baf46e/app/register.go#L3-L40
918
_ "github.com/iawia002/lux/extractors/acfun"
1019
_ "github.com/iawia002/lux/extractors/bcy"
1120
_ "github.com/iawia002/lux/extractors/bilibili"
@@ -43,3 +52,66 @@ import (
4352
_ "github.com/iawia002/lux/extractors/youku"
4453
_ "github.com/iawia002/lux/extractors/youtube"
4554
)
55+
56+
const filename = "sites"
57+
58+
//go:embed sites
59+
var sites embed.FS
60+
61+
var managedMediaSites = make(map[string]struct{})
62+
63+
func init() {
64+
parseMediaSites(filename)
65+
}
66+
67+
func baseHost(u *url.URL) (string, error) {
68+
dom, err := publicsuffix.EffectiveTLDPlusOne(u.Hostname())
69+
if err != nil {
70+
return "", err
71+
}
72+
return dom, nil
73+
}
74+
75+
func parseMediaSites(fn string) {
76+
file, err := sites.Open(fn)
77+
if err != nil {
78+
return
79+
}
80+
defer file.Close()
81+
82+
scanner := bufio.NewScanner(file)
83+
for scanner.Scan() {
84+
host := strings.TrimSpace(scanner.Text())
85+
managedMediaSites[host] = struct{}{}
86+
}
87+
88+
// Combine extra sites
89+
extra := os.Getenv("WAYBACK_MEDIA_SITES")
90+
if len(extra) > 0 {
91+
for _, s := range strings.Split(extra, ",") {
92+
u, err := url.Parse(s)
93+
if err != nil {
94+
continue
95+
}
96+
dom, err := baseHost(u)
97+
if err != nil {
98+
continue
99+
}
100+
managedMediaSites[dom] = struct{}{}
101+
}
102+
}
103+
104+
if err := scanner.Err(); err != nil {
105+
logger.Warn("append managed media sites failed: %v", err)
106+
}
107+
}
108+
109+
func supportedMediaSite(u *url.URL) bool {
110+
dom, err := baseHost(u)
111+
if err != nil {
112+
return false
113+
}
114+
_, ok := managedMediaSites[dom]
115+
116+
return ok
117+
}

reduxer/media_test.go

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Copyright 2022 Wayback Archiver. All rights reserved.
2+
// Use of this source code is governed by the GNU GPL v3
3+
// license that can be found in the LICENSE file.
4+
5+
package reduxer // import "github.com/wabarc/wayback/reduxer"
6+
7+
import (
8+
"net/url"
9+
"os"
10+
"testing"
11+
)
12+
13+
// Fixtures shared by the tests in this file.
const (
	// host is a URL whose registrable domain is expected to equal domain.
	host = "https://www.youtube.com"
	// domain is the base host expected for host.
	domain = "youtube.com"
)

var (
	// validURL is host parsed into a *url.URL.
	validURL, _ = url.Parse(host)
	// invalidURL carries a single-label host name.
	invalidURL = &url.URL{Host: "invalid-tld"}
)
22+
23+
func TestBaseHost(t *testing.T) {
24+
var tests = []struct {
25+
url *url.URL
26+
exp string
27+
}{
28+
{validURL, domain},
29+
{invalidURL, ``},
30+
}
31+
32+
for _, test := range tests {
33+
t.Run("", func(t *testing.T) {
34+
dom, _ := baseHost(test.url)
35+
if dom != test.exp {
36+
t.Errorf(`Unexpected extract base host, got %v instead of %v`, dom, test.exp)
37+
}
38+
})
39+
}
40+
}
41+
42+
func TestSupportedMediaSite(t *testing.T) {
43+
extraDomain := "https://extra-domain.com"
44+
missing, _ := url.Parse("https://missing.com")
45+
extraURL, _ := url.Parse(extraDomain)
46+
47+
var tests = []struct {
48+
url *url.URL
49+
testname string
50+
filename string
51+
extra string
52+
supported bool
53+
}{
54+
{validURL, `test with valid url`, filename, ``, true},
55+
{invalidURL, `test with invalid url`, filename, ``, false},
56+
{missing, `test not found`, filename, ``, false},
57+
{extraURL, `test extra sites`, filename, extraDomain, true},
58+
{invalidURL, `test extra invalid sites`, filename, extraDomain, false},
59+
{invalidURL, `test sites configuration file not exists`, `/path/not/exists`, extraDomain, false},
60+
}
61+
62+
for _, test := range tests {
63+
t.Run(test.testname, func(t *testing.T) {
64+
os.Setenv("WAYBACK_MEDIA_SITES", test.extra)
65+
parseMediaSites(test.filename)
66+
supported := supportedMediaSite(test.url)
67+
if supported != test.supported {
68+
t.Errorf(`Unexpected check download media supported, got %v instead of %v`, supported, test.supported)
69+
}
70+
})
71+
}
72+
}

reduxer/reduxer.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,10 @@ func Do(ctx context.Context, urls ...*url.URL) (Reduxer, error) {
226226
if err := helper.SetField(&artifact.WARC, "Local", craft(u)); err != nil {
227227
logger.Error("assign field WARC to path struct failed: %v", err)
228228
}
229-
if err := helper.SetField(&artifact.Media, "Local", media(ctx, dir, shot.URL)); err != nil {
230-
logger.Error("assign field Media to path struct failed: %v", err)
229+
if supportedMediaSite(u) {
230+
if err := helper.SetField(&artifact.Media, "Local", media(ctx, dir, shot.URL)); err != nil {
231+
logger.Error("assign field Media to path struct failed: %v", err)
232+
}
231233
}
232234
// Attach single file
233235
singleFilePath := singleFile(ctx, bytes.NewReader(shot.HTML), dir, shot.URL)

reduxer/sites

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
365yg.com
2+
56.com
3+
7gogo.jp
4+
acfun.cn
5+
alive.in.th
6+
archive.org
7+
baidu.com
8+
bandcamp.com
9+
baomihua.com
10+
bilibili.com
11+
cbs.com
12+
cntv.cn
13+
coub.com
14+
dailymotion.com
15+
douban.com
16+
douyin.com
17+
douyutv.com
18+
ehow.com
19+
facebook.com
20+
fc2.com
21+
flickr.com
22+
freesound.org
23+
fun.tv
24+
heavy-music.ru
25+
huomao.com
26+
imgur.com
27+
infoq.com
28+
instagram.com
29+
interest.me
30+
iqiyi.com
31+
isuntv.com
32+
ixigua.com
33+
joy.cn
34+
khanacademy.org
35+
ku6.com
36+
kuaishou.com
37+
kugou.com
38+
kuwo.cn
39+
le.com
40+
lizhi.fm
41+
lrts.me
42+
magisto.com
43+
metacafe.com
44+
mgtv.com
45+
miaopai.com
46+
miomio.tv
47+
missevan.com
48+
mixcloud.com
49+
mtv81.com
50+
naver.com
51+
nicovideo.jp
52+
pinterest.com
53+
pixnet.net
54+
pptv.com
55+
qq.com
56+
reddit.com
57+
showroom-live.com
58+
sina.com.cn
59+
slideshare.net
60+
sohu.com
61+
soundcloud.com
62+
ted.com
63+
tiktok.com
64+
tudou.com
65+
tumblr.com
66+
twitch.tv
67+
twitter.com
68+
veoh.com
69+
v.ifeng.com
70+
vimeo.com
71+
vine.co
72+
v.iqilu.com
73+
vk.com
74+
weibo.cn
75+
weibo.com
76+
xiami.com
77+
xinpianchang.com
78+
youku.com
79+
youtube.com
80+
zhanqi.tv
81+
zhibo.tv
82+
zhihu.com

wayback.1

+3
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ Directory to store binary file, e.g. PDF, html file\&.
192192
.B WAYBACK_MAX_MEDIA_SIZE
193193
Max size to limit download stream media. default 512MB\&.
194194
.TP
195+
.B WAYBACK_MEDIA_SITES
196+
Additional media websites to support, separated by commas\&.
197+
.TP
195198
.B WAYBACK_TELEGRAM_TOKEN
196199
Telegram Bot API Token. (same as flag --token)\&.
197200
.TP

wayback.conf

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ CHROME_REMOTE_ADDR=127.0.0.1:9222
5656
WAYBACK_POOLING_SIZE=3
5757
WAYBACK_STORAGE_DIR=
5858
WAYBACK_MAX_MEDIA_SIZE=512MB
59+
WAYBACK_MEDIA_SITES=
5960
WAYBACK_TIMEOUT=300
6061
WAYBACK_USERAGENT=WaybackArchiver/1.0
6162
WAYBACK_FALLBACK=off

0 commit comments

Comments
 (0)