如何做财报的桑基图 | 一个无人问津的小站

事情的起因是这张图：通过 GPT 获取财报 PDF 的数据，然后绘制成一个桑基图。
思路其实比较简单：
1. 通过 GPT 获取数据，整理成结构化的数据
2. 做一个桑基图渲染的模板
3. 把数据渲染出来
本来做到一半，GPT 代码都写得七七八八了，但是美化不够，效果这样。
搜索模板的时候，找到这么个网站： https://www.sankeyart.com/sankeys/72/ 直接在线就能可视化数据。而且大一点的公司，人家都做好了。 https://www.sankeyart.com/sankeys/public/31890/
算了，不做了，代码备份一个：
import pandas as pd
import plotly.graph_objects as go

def plot_balanced_sankey(
    df: pd.DataFrame,
    level_colors: "dict[str, str] | None" = None,
    title: str = "收支流向图",
    width: int = 800,
    height: int = 800,
    total_node: str = "总额",
) -> None:
    """Build an income/expense Sankey diagram with plotly and display it.

    Every row of ``df`` is one link. The ``level`` column is read as the tier
    of the link's *source* node: tiers starting with ``'L'`` are drawn as
    income links, tiers starting with ``'R'`` as expense links, and anything
    else as a neutral transition link.

    Args:
        df: Frame with the columns ``source``, ``target``, ``value`` and
            ``level``.
        level_colors: Mapping from level name to a CSS colour string. Entries
            override the built-in palette; levels found in neither fall back
            to the neutral grey. ``None`` uses the built-in palette only.
        title: Chart title (a unit subtitle is appended).
        width: Figure width in pixels.
        height: Figure height in pixels.
        total_node: Name of the central node. Its total outflow is the
            denominator for the percentages shown on the other nodes, and its
            own label carries no percentage.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    default_colors = {
        'L1': 'rgba(173, 216, 230, 0.7)',    # light blue
        'L2': 'rgba(135, 206, 235, 0.7)',    # sky blue
        'L3': 'rgba(100, 149, 237, 0.7)',    # cornflower blue
        'M': 'rgba(169, 169, 169, 0.7)',     # neutral grey
        'R1': 'rgba(144, 238, 144, 0.7)',    # light green
        'R2': 'rgba(152, 251, 152, 0.7)',    # pale green
        'R3': 'rgba(143, 188, 143, 0.7)'     # dark sea green
    }
    # Caller entries win; a partial palette no longer raises KeyError for the
    # levels it leaves out.
    palette = {**default_colors, **(level_colors or {})}
    fallback_color = default_colors['M']

    sources = df['source'].tolist()
    targets = df['target'].tolist()
    levels = df['level'].tolist()

    # Total inflow / outflow per node, accumulated in a single pass.
    inflow = {}
    outflow = {}
    for src, tgt, amount in zip(sources, targets, df['value']):
        outflow[src] = outflow.get(src, 0) + amount
        inflow[tgt] = inflow.get(tgt, 0) + amount

    # Unique nodes in first-appearance order, so the index assignment is the
    # same on every run (a set would reorder them between runs).
    all_nodes = list(dict.fromkeys([*sources, *targets]))
    node_dict = {node: i for i, node in enumerate(all_nodes)}

    # Denominator for the percentages: everything flowing out of the total node.
    total_value = outflow.get(total_node, 0)

    # Node labels: name, amount and (except for the total node) share of total.
    node_labels = []
    for node in all_nodes:
        value = max(inflow.get(node, 0), outflow.get(node, 0))
        if node == total_node or not total_value:
            # No percentage for the total itself, nor when there is no total
            # to divide by.
            label = f"{node}\n{value:,.0f}M"
        else:
            percentage = value / total_value * 100
            label = f"{node}\n{value:,.0f}M\n({percentage:.1f}%)"
        node_labels.append(label)

    # A node's tier is the level of the first row where it is the source.
    # Nodes that only ever appear as targets (leaves) take the level of the
    # first row that points at them.
    node_level = {}
    for src, level in zip(sources, levels):
        node_level.setdefault(src, level)
    for tgt, level in zip(targets, levels):
        node_level.setdefault(tgt, level)
    node_colors = [
        palette.get(node_level[node], fallback_color) for node in all_nodes
    ]

    # Link colours follow the row's own level (the tier of the link's source).
    link_colors = []
    for level in levels:
        tier = str(level)
        if tier.startswith('L'):
            link_colors.append('rgba(173, 216, 230, 0.3)')  # income side
        elif tier.startswith('R'):
            link_colors.append('rgba(144, 238, 144, 0.3)')  # expense side
        else:
            link_colors.append('rgba(169, 169, 169, 0.3)')  # middle transition

    # Assemble the Sankey trace.
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 20,
            thickness = 25,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = node_colors
        ),
        link = dict(
            source = [node_dict[src] for src in sources],
            target = [node_dict[tgt] for tgt in targets],
            value = df['value'],
            color = link_colors
        )
    )])

    # Layout: centred title with a unit subtitle, transparent background.
    fig.update_layout(
        title = dict(
            text = f"{title}<br><sub>单位：百万元</sub>",
            font = dict(size=20),
            x = 0.5,
            y = 0.95
        ),
        font = dict(size=12),
        width = width,
        height = height,
        paper_bgcolor = 'rgba(0,0,0,0)',
        plot_bgcolor = 'rgba(0,0,0,0)',
        margin = dict(t=100, l=80, r=80, b=80)
    )

    fig.show()

# Sample data: one (source, target, value, level) tuple per link.
_FLOW_ROWS = [
    # Income side (left): detailed items into channels, tagged L3
    ('电商销售', '线上渠道', 300, 'L3'),
    ('APP销售', '线上渠道', 200, 'L3'),
    ('直营店', '线下渠道', 250, 'L3'),
    ('加盟店', '线下渠道', 150, 'L3'),
    ('理财收益', '其他收入', 80, 'L3'),
    ('投资收益', '其他收入', 120, 'L3'),

    # Channels into income categories, tagged L2
    ('线上渠道', '主营收入', 500, 'L2'),
    ('线上渠道', '主营收入', 200, 'L2'),
    ('线下渠道', '主营收入', 400, 'L2'),
    ('线下渠道', '主营收入', 100, 'L2'),
    # NOTE(review): the next two rows link a node to itself; this looks
    # unintended - confirm against the intended hierarchy.
    ('其他收入', '其他收入', 80, 'L2'),
    ('其他收入', '其他收入', 120, 'L2'),

    # Income categories into the total, tagged L1
    ('主营收入', '总额', 800, 'L1'),
    ('主营收入', '总额', 400, 'L1'),
    ('其他收入', '总额', 200, 'L1'),

    # Expense side (right): the total split into categories, tagged M
    ('总额', '营业成本', 400, 'M'),
    ('总额', '运营费用', 300, 'M'),
    ('总额', '税费', 100, 'M'),
    ('总额', '净利润', 200, 'M'),

    # Expense categories into detailed items, tagged R1
    ('营业成本', '原材料', 200, 'R1'),
    ('营业成本', '人工成本', 120, 'R1'),
    ('营业成本', '制造费用', 80, 'R1'),
    ('运营费用', '销售费用', 150, 'R1'),
    ('运营费用', '管理费用', 100, 'R1'),
    ('运营费用', '研发费用', 50, 'R1'),
    ('税费', '所得税', 60, 'R1'),
    ('税费', '其他税费', 40, 'R1'),
    # NOTE(review): these three rows are tagged R2 while the sibling
    # category-to-item rows above are tagged R1 - confirm which is intended.
    ('净利润', '股东分红', 100, 'R2'),
    ('净利润', '公司留存', 60, 'R2'),
    ('净利润', '员工奖金', 40, 'R2'),
]

# Column-oriented view of the rows, in the shape pandas expects.
data = {
    'source': [row[0] for row in _FLOW_ROWS],
    'target': [row[1] for row in _FLOW_ROWS],
    'value': [row[2] for row in _FLOW_ROWS],
    'level': [row[3] for row in _FLOW_ROWS],
}

df = pd.DataFrame(data)

# Per-level palette: the blues get lighter from the outer income tier (L3)
# towards the centre, grey marks the middle, greens mark the expense tiers.
level_colors = dict(
    L3='rgba(100, 149, 237, 0.7)',  # deep blue
    L2='rgba(135, 206, 235, 0.7)',  # medium blue
    L1='rgba(173, 216, 230, 0.7)',  # light blue
    M='rgba(169, 169, 169, 0.7)',   # neutral grey
    R1='rgba(144, 238, 144, 0.7)',  # light green
    R2='rgba(85, 107, 47, 0.7)',    # dark green
)

# Render the diagram with this palette.
plot_balanced_sankey(df, level_colors, title="公司收支流向图 2023年度")

# Rebuild the frame from the same sample data for a second rendering.
df = pd.DataFrame(data)

# Alternative palette with lighter greens on the expense side.
level_colors = {
    'L1': 'rgba(173, 216, 230, 0.7)',    # light blue
    'L2': 'rgba(135, 206, 235, 0.7)',    # sky blue
    # The sample data contains 'L3' rows, so the palette needs an entry for
    # that level as well.
    'L3': 'rgba(100, 149, 237, 0.7)',    # cornflower blue
    'M': 'rgba(169, 169, 169, 0.7)',     # neutral grey
    'R1': 'rgba(144, 238, 144, 0.7)',    # light green
    'R2': 'rgba(152, 251, 152, 0.7)',    # pale green
}

# Render the diagram with the alternative palette.
plot_balanced_sankey(
    df=df,
    level_colors=level_colors,
    title="公司收支流向图"
)