While browsing Twitter, I bookmark anything that looks worth keeping. But X's built-in bookmark search is weak and slow, so I decided to save all of my bookmarked content locally, where it's easy to look things up from time to time.

I found two useful scripts: one fixes the scroll-position problem on the bookmarks page, and the other exports all of the data.

X (推特) 书签滚动位置保持器 (X/Twitter bookmark scroll position keeper)
twitter-web-exporter

Usage

Use twitter-web-exporter: after scrolling through and loading all bookmarks, export them as JSON. Then a small script converts each bookmarked tweet into a markdown file, automatically generating tags and links between related notes.
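For reference, here is a minimal sketch of one exported record, limited to the fields the conversion script below actually reads. The real twitter-web-exporter JSON contains more fields, and all values here are invented for illustration.

# One bookmarked tweet as exported by twitter-web-exporter (illustrative values only;
# only the fields consumed by the conversion script below are shown).
sample_tweet = {
    "id": "1843000000000000000",
    "name": "Example User",
    "screen_name": "example_user",
    "created_at": "2024-10-01 12:00:00 +0000",
    "full_text": "An example tweet that was bookmarked.",
    "media": [{"type": "photo", "original": "https://pbs.twimg.com/media/example.jpg"}],
    "favorite_count": 12,
    "retweet_count": 3,
    "bookmark_count": 5,
    "views_count": 1000,
    "url": "https://x.com/example_user/status/1843000000000000000",
}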

The script is as follows:

import json
import os
import re
from datetime import datetime

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop word list (Chinese and English) used to filter out non-informative tags
STOP_WORDS = set(['的', '了', '和', '是', '就', '都', '而', '及', '与', '着', '或', '一个', '没有', '我们', '你们', '他们', '它们', '有些', '这个', '那个', '这些', '那些', '这样', '那样', '如此', '只是', '但是', '不过', '然而', '可是', '虽然', '因为', '所以', '因此', '于是', '故而', '虽然', '尽管', '不过', '不仅', '而且', '并且', '然后', '接着', '其次', '最后', '总之', '总的来说', '换句话说', '也就是说', 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after', 'beneath', 'under', 'above'])

def sanitize_filename(filename):
    # Keep only letters, digits, spaces, hyphens, underscores, and dots
    return "".join([c for c in filename if c.isalpha() or c.isdigit() or c in ' -_.']).rstrip()

def create_markdown(tweet, tags, related_tweets):
    content = f"# {tweet['name']} (@{tweet['screen_name']})\n\n"
    content += f"**Date:** {tweet['created_at']}\n\n"
    content += f"{tweet['full_text']}\n\n"

    if tweet['media']:
        content += "**Media:**\n\n"
        for media in tweet['media']:
            if media['type'] == 'photo':
                content += f"![{media['type']}]({media['original']})\n\n"
            elif media['type'] == 'video':
                content += f"[Video]({media['original']})\n\n"

    if tags:
        content += "**Tags:** " + ", ".join([f"#{tag}" for tag in tags]) + "\n\n"

    if related_tweets:
        content += "**Related Tweets:**\n\n"
        for related_tweet in related_tweets:
            content += f"- [[{related_tweet}]]\n"
        content += "\n"

    content += "---\n"
    content += f"Favorite count: {tweet['favorite_count']}\n"
    content += f"Retweet count: {tweet['retweet_count']}\n"
    content += f"Bookmark count: {tweet['bookmark_count']}\n"
    content += f"View count: {tweet['views_count']}\n"
    content += f"URL: {tweet['url']}\n"

    return content

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def get_filename(tweet):
    date = datetime.strptime(tweet['created_at'], "%Y-%m-%d %H:%M:%S %z")
    return f"{date.strftime('%Y%m%d')}_{sanitize_filename(tweet['name'])}_{tweet['id']}.md"

def is_valid_tag(word):
    # Allow Chinese words and English words of at least two characters
    return (len(word) > 1 and not word.isdigit() and word.lower() not in STOP_WORDS) and \
           (re.match(r'^[\u4e00-\u9fa5]+$', word) or re.match(r'^[a-zA-Z]{2,}$', word))

def extract_words(text):
    text = remove_urls(text)
    # Split the text into English words and runs of Chinese characters
    words = re.findall(r'\w+|[\u4e00-\u9fa5]+', text)
    # Segment the Chinese runs with jieba
    chinese_words = []
    for word in words:
        if re.match(r'^[\u4e00-\u9fa5]+$', word):
            chinese_words.extend(jieba.cut(word))
        else:
            chinese_words.append(word)
    return [word for word in chinese_words if is_valid_tag(word)]

def main():
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)

    with open('twitter-1728361455821.json', 'r', encoding='utf-8') as file:
        tweets = json.load(file)

    texts = [' '.join(extract_words(tweet['full_text'])) for tweet in tweets]

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    tweet_tags = {}
    related_tweets = {get_filename(tweet): set() for tweet in tweets}

    for i, tweet in enumerate(tweets):
        filename = get_filename(tweet)

        # Pick the five highest-scoring TF-IDF terms of this tweet as its tags
        feature_index = tfidf_matrix[i, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])

        tags = sorted([(feature_names[idx], s) for (idx, s) in tfidf_scores], key=lambda x: x[1], reverse=True)
        tags = [tag for tag, _ in tags if is_valid_tag(tag)][:5]

        tweet_tags[filename] = set(tags)

        # Find related tweets: link two notes only if they share at least two tags
        for other_filename, other_tags in tweet_tags.items():
            if other_filename != filename:
                common_tags = tweet_tags[filename].intersection(other_tags)
                if len(common_tags) >= 2:
                    related_tweets[filename].add(other_filename)
                    related_tweets[other_filename].add(filename)

        markdown_content = create_markdown(tweet, tags, related_tweets[filename])

        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_content)

    print(f"Done. Generated {len(tweets)} markdown files.")

if __name__ == "__main__":
    main()
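
Before generating all the files, it can help to preview the extracted keyword candidates for a few tweets and tune STOP_WORDS or the TfidfVectorizer parameters accordingly. A minimal dry-run sketch, assuming the script above is saved as bookmarks_to_markdown.py (the filename is arbitrary):

# Dry run: print candidate tag words for the first few bookmarks without writing any files.
# Assumes the conversion script is saved as bookmarks_to_markdown.py (hypothetical name);
# importing it is safe because main() only runs under the __main__ guard.
import json

from bookmarks_to_markdown import extract_words

with open('twitter-1728361455821.json', 'r', encoding='utf-8') as f:
    tweets = json.load(f)

for tweet in tweets[:5]:
    print(tweet['url'])
    print(extract_words(tweet['full_text'])[:10])
    print()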