| import json import os import re from datetime import datetime import jieba from sklearn.feature_extraction.text import TfidfVectorizer
# Words that are too common to be useful as tags. Membership is tested
# against the lowercased candidate word.
STOP_WORDS = {
    # Chinese particles and single-character function words
    '的', '了', '和', '是', '就', '都', '而', '及', '与', '着', '或',
    # Chinese pronouns, quantifiers and demonstratives
    '一个', '没有', '我们', '你们', '他们', '它们', '有些',
    '这个', '那个', '这些', '那些', '这样', '那样', '如此',
    # Chinese conjunctions and discourse connectives
    '只是', '但是', '不过', '然而', '可是', '虽然',
    '因为', '所以', '因此', '于是', '故而', '尽管',
    '不仅', '而且', '并且', '然后', '接着', '其次', '最后',
    '总之', '总的来说', '换句话说', '也就是说',
    # English articles and conjunctions
    'the', 'a', 'an', 'and', 'or', 'but',
    # English prepositions
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up',
    'about', 'into', 'over', 'after', 'beneath', 'under', 'above',
}
def sanitize_filename(filename):
    """Drop every character that is not allowed in a generated file name.

    Alphabetic characters, digits, spaces, hyphens, underscores and dots are
    kept in their original order; everything else is removed. Trailing
    whitespace is stripped from the result.
    """
    allowed_punctuation = ' -_.'

    def is_allowed(char):
        return char.isalpha() or char.isdigit() or char in allowed_punctuation

    kept = filter(is_allowed, filename)
    return "".join(kept).rstrip()
def create_markdown(tweet, tags, related_tweets):
    """Render one tweet as a Markdown note.

    Args:
        tweet: Dict with the keys ``name``, ``screen_name``, ``created_at``,
            ``full_text``, ``favorite_count``, ``retweet_count``,
            ``bookmark_count``, ``views_count`` and ``url``. ``media`` is
            optional; when present and non-empty it is an iterable of dicts
            with ``type`` and ``original`` keys.
        tags: Tag strings, rendered in the given order with a leading ``#``.
        related_tweets: Collection of note names, rendered as ``[[...]]``
            links in sorted order.

    Returns:
        The Markdown document as a single string.

    Raises:
        KeyError: If a required key is missing from ``tweet`` or from a
            media entry.
    """
    parts = [
        f"# {tweet['name']} (@{tweet['screen_name']})\n\n",
        f"**Date:** {tweet['created_at']}\n\n",
        f"{tweet['full_text']}\n\n",
    ]

    # Tolerate records without a "media" key instead of raising KeyError.
    media_items = tweet.get('media')
    if media_items:
        parts.append("**Media:**\n\n")
        for media in media_items:
            # Only photos and videos are rendered; other types are skipped.
            if media['type'] == 'photo':
                parts.append(f"![{media['type']}]({media['original']})\n\n")
            elif media['type'] == 'video':
                parts.append(f"[Video]({media['original']})\n\n")

    if tags:
        parts.append("**Tags:** " + ", ".join(f"#{tag}" for tag in tags) + "\n\n")

    if related_tweets:
        parts.append("**Related Tweets:**\n\n")
        # Callers pass a set; sorting keeps the output identical across runs
        # instead of depending on set iteration order.
        parts.extend(f"- [[{related_tweet}]]\n" for related_tweet in sorted(related_tweets))
        parts.append("\n")

    parts.extend([
        "---\n",
        f"Favorite count: {tweet['favorite_count']}\n",
        f"Retweet count: {tweet['retweet_count']}\n",
        f"Bookmark count: {tweet['bookmark_count']}\n",
        f"View count: {tweet['views_count']}\n",
        f"URL: {tweet['url']}\n",
    ])
    return "".join(parts)
def remove_urls(text):
    """Return ``text`` with every ``http``-prefixed run of non-whitespace removed."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)


def get_filename(tweet):
    """Build the Markdown file name for a tweet.

    The name has the form ``<YYYYMMDD>_<sanitized name>_<id>.md``. The date
    is parsed from ``tweet['created_at']``, the name from ``tweet['name']``
    (passed through ``sanitize_filename``) and the id from ``tweet['id']``.
    """
    created = datetime.strptime(tweet['created_at'], "%Y-%m-%d %H:%M:%S %z")
    day_stamp = created.strftime('%Y%m%d')
    author = sanitize_filename(tweet['name'])
    tweet_id = tweet['id']
    return f"{day_stamp}_{author}_{tweet_id}.md"
def is_valid_tag(word):
    """Return whether ``word`` is acceptable as a tag.

    A valid tag is at least two characters long, consists either entirely of
    CJK ideographs in the range U+4E00-U+9FA5 or entirely of ASCII letters,
    and is not a stop word (compared in lowercase).

    Args:
        word: Candidate tag string.

    Returns:
        ``True`` or ``False``. The result is always a real ``bool``, never a
        match object or ``None``.
    """
    if len(word) < 2:
        return False
    # fullmatch, unlike ``^...$`` with match, does not accept a trailing
    # newline. The pattern also rejects digit-only and mixed-script words.
    if re.fullmatch(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', word) is None:
        return False
    return word.lower() not in STOP_WORDS
def extract_words(text):
    """Tokenize ``text`` into candidate tag words.

    URLs are removed first. The remaining text is split into runs of CJK
    ideographs (U+4E00-U+9FA5) and runs of other word characters. CJK runs
    are segmented with ``jieba.cut``; other runs are kept whole. Only tokens
    accepted by ``is_valid_tag`` are returned, in order of appearance.

    Args:
        text: Raw tweet text.

    Returns:
        List of words that passed ``is_valid_tag``.
    """
    text = remove_urls(text)
    # ``\w`` also matches CJK ideographs, so a plain ``\w+`` swallows mixed
    # text such as "用Python写" as a single token that is neither segmented
    # nor a valid tag. Match CJK runs and non-CJK word runs separately.
    runs = re.findall(r'[\u4e00-\u9fa5]+|[^\W\u4e00-\u9fa5]+', text)
    candidates = []
    for run in runs:
        # Each run is homogeneous, so its first character identifies it.
        if '\u4e00' <= run[0] <= '\u9fa5':
            candidates.extend(jieba.cut(run))
        else:
            candidates.append(run)
    return [word for word in candidates if is_valid_tag(word)]
def main(input_path='twitter-1728361455821.json', output_dir='output'):
    """Convert a JSON export of tweets into tagged, cross-linked Markdown notes.

    Each tweet gets up to five tags chosen by TF-IDF score. Two tweets are
    treated as related when they share at least two tags. One Markdown file
    per tweet is written to ``output_dir``.

    Args:
        input_path: Path of the JSON file holding a list of tweet dicts.
        output_dir: Directory for the generated files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8') as file:
        tweets = json.load(file)

    if not tweets:
        # Nothing to vectorize; fitting on an empty corpus would raise.
        print(f"处理完成。共生成 {len(tweets)} 个 markdown 文件。")
        return

    texts = [' '.join(extract_words(tweet['full_text'])) for tweet in tweets]

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    filenames = [get_filename(tweet) for tweet in tweets]

    # Pass 1: choose up to five valid tags per tweet, highest score first.
    tags_per_tweet = []
    for row_index in range(len(tweets)):
        columns = tfidf_matrix[row_index, :].nonzero()[1]
        scored = sorted(
            ((feature_names[col], tfidf_matrix[row_index, col]) for col in columns),
            key=lambda pair: pair[1],
            reverse=True,
        )
        tags_per_tweet.append([tag for tag, _ in scored if is_valid_tag(tag)][:5])

    # Pass 2: link every pair of tweets that shares at least two tags. This
    # must finish before any file is written; otherwise notes written early
    # would miss relations discovered while processing later tweets.
    tweet_tags = {
        filename: set(tags) for filename, tags in zip(filenames, tags_per_tweet)
    }
    related_tweets = {filename: set() for filename in filenames}
    unique_filenames = list(tweet_tags)
    for position, filename in enumerate(unique_filenames):
        for other_filename in unique_filenames[:position]:
            common_tags = tweet_tags[filename] & tweet_tags[other_filename]
            if len(common_tags) >= 2:
                related_tweets[filename].add(other_filename)
                related_tweets[other_filename].add(filename)

    # Pass 3: write the notes now that the relation graph is complete.
    for tweet, filename, tags in zip(tweets, filenames, tags_per_tweet):
        # Sorted so the generated files are identical across runs.
        markdown_content = create_markdown(
            tweet, tags, sorted(related_tweets[filename])
        )
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_content)

    print(f"处理完成。共生成 {len(tweets)} 个 markdown 文件。")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()