123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- import pandas as pd
- import numpy as np
- def calculate_ed(y, genotype):
- """
- 计算ED值
- :param y: 表现型数据,一维numpy数组
- :param genotype: SNP基因型数据,Pandas DataFrame,行数为个体数量,列数为SNP位点数
- :return: ED值
- """
- # 检查输入数据是否合法
- if not isinstance(y, np.ndarray) or not isinstance(genotype, pd.DataFrame):
- raise TypeError("输入数据格式错误")
- # 计算每个个体的基因型得分
- score = genotype.sum(axis=1)
- # 计算每个个体的方差
- var_score = np.var(score)
- # 计算每个个体与平均表现型的差异的平方和
- sum_delta_y_sq = np.sum((y - np.mean(y))**2)
- # 计算ED值
- ed = sum_delta_y_sq / var_score
- return ed
- def main():
- # 读取MAF文件
- # df = pd.read_csv('example.maf', sep='\t', comment='#')
- # 读取MAF文件
- df = pd.read_csv('69dbfe7c-3efc-4126-b36d-2b59aca7a16d.wxs.aliquot_ensemble_masked.maf', sep='\t', comment='#')
- # 遍历每一列并将非数字值转换为 NaN
- for col in df.columns:
- # 将每列中的数据转换为数字类型
- df[col] = pd.to_numeric(df[col], errors='coerce')
- # 会使用 apply() 方法来遍历每列数据,并将大于等于1000的值设置为 NaN。如果你只想保留小于10000的数字,可以将 x < 1000 改为 x >= 10000。
- df[col] = df[col].apply(lambda x: x if x is np.nan or x < 10000 else np.nan)
- # 删除包含 NaN 的列
- df = df.dropna(axis=1)
- # 提取表现型数据和基因型数据
- # y = df.iloc[:, 0].values
- # 这里提取关键数据作为对比用户上传的SNP数据来进行对比,如果ed值越高,则意味着改用户换胃癌的风险就越高
- lst = [75, 420, 73, 52, 72, 62, 82, 36, 102, 56, 90, 224, 160, 66, 99, 87, 24, 93, 85, 73, 70, 38, 52, 159, 102, 37,
- 37, 33, 49, 65, 20, 67, 74, 84, 64, 85, 16, 12, 42, 60, 38, 49, 86, 72, 97, 176, 63, 76, 61, 54, 153, 39, 53,
- 70, 118, 102, 25, 49, 17, 114, 48, 52, 110, 24, 90, 61, 48, 71, 30, 48, 41, 32, 56, 59, 52, 84, 52, 53, 47,
- 117, 152, 48, 68, 93, 116]
- y = np.array(lst)
- genotype = df.iloc[:, 1:].to_dict()
- # 计算 ED 值,并保留3哥小数位
- ed = round(calculate_ed(y, pd.DataFrame(genotype)), 3)
- # print('ed', ed)
- return ed
- main()
|