Skip to content

Commit 44c611d

Browse files
committed
facebook scraping: fix post id, stop using story_fbid query param
1 parent e382533 commit 44c611d

File tree

4 files changed

+27
-8
lines changed

4 files changed

+27
-8
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,8 @@ Changelog
297297
* Twitter
298298
* Trim alt text in line between post preview and creation
299299
* Correctly trim Twitter alt text
300+
* Facebook
301+
* Scraping: extract post id from `_ft_` query param instead of `story_fbid`, which is now an opaque token that changes regularly. ([facebook-atom#27](https://github.com/snarfed/facebook-atom/issues/27))
300302

301303
### 4.0 - 2022-03-23
302304

granary/facebook.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1853,10 +1853,22 @@ def scraped_to_activities(self, scraped, log_html=False, **kwargs):
18531853
logger.debug('Skipping "Suggested for you"')
18541854
continue
18551855

1856-
url = self._sanitize_url(urllib.parse.urljoin(self.BASE_URL, permalink['href']))
1856+
url = urllib.parse.urljoin(self.BASE_URL, permalink['href'])
18571857
query = urllib.parse.urlparse(url).query
18581858
parsed = urllib.parse.parse_qs(query)
1859-
post_id = parsed['story_fbid'][0]
1859+
1860+
# story_fbid stopped being useful in May 2022: it switched to an opaque
1861+
# token that changes regularly, even for the same post.
1862+
# https://github.com/snarfed/facebook-atom/issues/27
1863+
ft = util.get_first(parsed, '_ft_') or ''
1864+
for elem in ft.split(':'):
1865+
if elem.startswith('top_level_post_id.') or elem.startswith('mf_objid.'):
1866+
post_id = elem.split('.')[1]
1867+
if post_id:
1868+
break
1869+
else:
1870+
post_id = util.get_first(parsed, 'story_fbid')
1871+
18601872
owner_id = parsed['id'][0]
18611873

18621874
author = self._m_html_author(post)
@@ -1923,6 +1935,7 @@ def scraped_to_activities(self, scraped, log_html=False, **kwargs):
19231935
if util.is_int(count_text):
19241936
reactions_count = int(count_text)
19251937

1938+
url = self._sanitize_url(url)
19261939
activities.append({
19271940
'objectType': 'activity',
19281941
'verb': 'post',

granary/tests/test_facebook.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1202,6 +1202,10 @@ def read_testdata(filename):
12021202
'actor': obj['author'],
12031203
'object': obj,
12041204
} for obj in MBASIC_OBJS]
1205+
MBASIC_FEED_ACTIVITIES = copy.deepcopy(MBASIC_ACTIVITIES)
1206+
MBASIC_FEED_ACTIVITIES[1]['url'] = \
1207+
MBASIC_FEED_ACTIVITIES[1]['object']['url'] = \
1208+
'https://www.facebook.com/story.php?story_fbid=456-story&id=212038'
12051209

12061210
MBASIC_PHOTO_ACTIVITY = {
12071211
'objectType': 'activity',
@@ -1866,7 +1870,7 @@ def test_get_activities_scrape_timeline(self):
18661870
self.mox.ReplayAll()
18671871

18681872
activities = self.fbscrape.get_activities(user_id='x', group_id=source.SELF)
1869-
self.assert_equals(MBASIC_ACTIVITIES, activities)
1873+
self.assert_equals(MBASIC_FEED_ACTIVITIES, activities)
18701874

18711875
def test_get_activities_scrape_timeline_fetch_replies_likes(self):
18721876
facebook.now_fn().MultipleTimes().AndReturn(datetime(1999, 1, 1))
@@ -3318,7 +3322,7 @@ def test_scraped_to_activities(self):
33183322
self.mox.ReplayAll()
33193323

33203324
got, _ = self.fb.scraped_to_activities(MBASIC_HTML_TIMELINE)
3321-
self.assert_equals(MBASIC_ACTIVITIES, got)
3325+
self.assert_equals(MBASIC_FEED_ACTIVITIES, got)
33223326

33233327
def test_scraped_to_activities_no_content(self):
33243328
soup = util.parse_html(MBASIC_HTML_TIMELINE)
@@ -3327,7 +3331,7 @@ def test_scraped_to_activities_no_content(self):
33273331
facebook.now_fn().MultipleTimes().AndReturn(datetime(1999, 1, 1))
33283332
self.mox.ReplayAll()
33293333

3330-
expected = copy.deepcopy(MBASIC_ACTIVITIES)
3334+
expected = copy.deepcopy(MBASIC_FEED_ACTIVITIES)
33313335
expected[0]['object']['content'] = '<div class="widePic">\n\n</div>'
33323336
got, _ = self.fb.scraped_to_activities(str(soup))
33333337
self.assert_equals(expected, got)

granary/tests/testdata/facebook.mbasic.feed.html

+3-3
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ <h3 class="dx dg dy ee">
280280
</div>
281281
<div class="cm cn">
282282
<span class="cl cm" id="like_456">
283-
<a class="dy dz" aria-label="145 reactions, including Like, Love and Care" href="/story.php?story_fbid=456&amp;id=212038&amp;refid=28&amp;_ft_=...">
283+
<a class="dy dz" aria-label="145 reactions, including Like, Love and Care" href="/story.php?story_fbid=456-story&amp;id=212038&amp;refid=28&amp;_ft_=qid.-6283032955...%3Amf_story_key.354173070...%3Atop_level_post_id.456%3Apage_id.2276775...%3Aweight.3133.86...%3Asty.263%3Amf_objid.456">
284284
<span class="ea">
285285
</span>145
286286
</a>
@@ -293,11 +293,11 @@ <h3 class="dx dg dy ee">
293293
</span>
294294
<span aria-hidden="true"> ·
295295
</span>
296-
<a href="/story.php?story_fbid=456&id=212038&refid=8&_ft_=..." class="dz">55 Comments
296+
<a href="/story.php?story_fbid=456-story&id=212038&refid=8&_ft_=qid.-6283032955...%3Amf_story_key.354173070...%3Atop_level_post_id.456%3Apage_id.2276775...%3Aweight.3133.86...%3Asty.263%3Amf_objid.456" class="dz">55 Comments
297297
</a>
298298
<span aria-hidden="true"> ·
299299
</span>
300-
<a href="/story.php?story_fbid=456&amp;id=212038&amp;refid=8&amp;_ft_=...">Full Story
300+
<a href="/story.php?story_fbid=456-story&amp;id=212038&amp;refid=8&amp;_ft_=qid.-6283032955...%3Amf_story_key.354173070...%3Atop_level_post_id.456%3Apage_id.2276775...%3Aweight.3133.86...%3Asty.263%3Amf_objid.456">Full Story
301301
</a>
302302
<span aria-hidden="true"> ·
303303
</span>

0 commit comments

Comments
 (0)