In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
In [17]:
# Load the TCGA BRCA MC3 mutation table (tab-separated text file).
# The location is built from the current user's home directory instead of a
# hardcoded /Users/<account> prefix, so the cell is not tied to one login name.
BRCA_MC3_PATH = Path.home() / "Documents" / "classAI" / "mc3_BRCA_mc3.txt"
TCGA_BRCA_MC3_Public = pd.read_csv(BRCA_MC3_PATH, sep="\t")
In [18]:
# Preview the BRCA mutation table loaded in the previous cell.
# (The frame is bound to TCGA_BRCA_MC3_Public; no variable named TCGA_BRCA_MC3
# is ever assigned in this notebook, so that name fails on a fresh kernel.)
TCGA_BRCA_MC3_Public
Out[18]:
sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [19]:
import os

# Report the directory the kernel is currently running in.
current_dir = os.getcwd()
print(current_dir)
/Users/reneewang/anaconda_projects/b8b15617-c3b6-46e2-ba19-2b28d4764e55
In [20]:
# Display the BRCA mutation table; pandas renders a truncated preview
# (first and last rows) followed by the overall row/column count.
TCGA_BRCA_MC3_Public
Out[20]:
sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [23]:
# Tally the rows of the BRCA table by their "effect" value, most frequent first.
mutation_type_counts = TCGA_BRCA_MC3_Public.loc[:, "effect"].value_counts()
mutation_type_counts
Out[23]:
effect Missense_Mutation 45634 Silent 17122 Frame_Shift_Del 8522 3'UTR 6695 Nonsense_Mutation 3666 Intron 3212 5'UTR 2492 Splice_Site 1399 RNA 1160 Frame_Shift_Ins 610 3'Flank 530 5'Flank 443 In_Frame_Del 441 Translation_Start_Site 74 Nonstop_Mutation 66 In_Frame_Ins 34 large deletion 19 Name: count, dtype: int64
In [24]:
# Count how many rows of the BRCA table belong to each "sample" identifier,
# largest first. Despite the name, this holds per-sample row counts.
sample_types = TCGA_BRCA_MC3_Public.loc[:, "sample"].value_counts()
sample_types
Out[24]:
sample TCGA-AC-A23H-01 6405 TCGA-EW-A2FV-01 4231 TCGA-D8-A27V-01 3332 TCGA-5L-AAT1-01 1995 TCGA-BH-A18G-01 1899 ... TCGA-AO-A03U-01 7 TCGA-A2-A25F-01 6 TCGA-LL-A440-01 6 TCGA-EW-A1P1-01 3 TCGA-AC-A2FK-01 3 Name: count, Length: 791, dtype: int64
In [31]:
# Count how many rows of the BRCA table are attributed to each "gene",
# largest first. Despite the name, this holds per-gene row counts.
gene_types = TCGA_BRCA_MC3_Public.loc[:, "gene"].value_counts()
gene_types
Out[31]:
gene PIK3CA 315 TTN 285 TP53 273 MUC16 141 CDH1 108 ... ZNF587B 1 BLZF1 1 SLPI 1 PLTP 1 FOXQ1 1 Name: count, Length: 18065, dtype: int64
In [32]:
# Horizontal bar chart of the genes with the most rows in the BRCA mutation table.
#
# The counts are taken straight from TCGA_BRCA_MC3_Public rather than from the
# shared name `gene_types`, because `gene_types` is rebound to the UCEC counts
# later in this notebook; re-running this cell afterwards would otherwise draw
# UCEC data under a BRCA title.
# pyplot (`plt`) is imported in the notebook's first cell, so it is not re-imported here.
top_n = 25  # drives both the number of bars and the number shown in the title
top_genes = TCGA_BRCA_MC3_Public["gene"].value_counts().head(top_n)

# Use an explicit Figure/Axes pair instead of pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind="barh", color="pink", edgecolor="black", ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title(f"Top {top_n} Most Frequently Mutated Genes in TCGA BRCA (MC3)")
ax.invert_yaxis()  # put the highest-count gene at the top of the chart
fig.tight_layout()
plt.show()
In [33]:
import pandas as pd
import matplotlib.pyplot as plt
In [36]:
# Load the TCGA UCEC MC3 mutation table (tab-separated text file).
# The location is built from the current user's home directory instead of a
# hardcoded /Users/<account> prefix, so the cell is not tied to one login name.
# NOTE(review): the BRCA table earlier in this notebook is read from
# Documents/classAI while this file is read from Desktop/classAI; confirm that
# both locations are intended.
UCEC_MC3_PATH = Path.home() / "Desktop" / "classAI" / "mc3_UCEC_mc3.txt"
TCGA_UCEC_MC3_Public = pd.read_csv(UCEC_MC3_PATH, sep="\t")
In [39]:
# Display the UCEC mutation table; pandas renders a truncated preview
# (first and last rows) followed by the overall row/column count.
TCGA_UCEC_MC3_Public
Out[39]:
sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-2E-A9G8-01 | 10 | 120354589 | 120354589 | C | G | PRLHR | Silent | p.V56V | 0.22 | NaN | NaN |
1 | TCGA-2E-A9G8-01 | 10 | 34626217 | 34626218 | - | T | PARD3 | Frame_Shift_Ins | p.S852Kfs*8 | 0.30 | NaN | NaN |
2 | TCGA-2E-A9G8-01 | 10 | 49939399 | 49939399 | G | A | WDFY4 | Silent | p.E458E | 0.30 | NaN | NaN |
3 | TCGA-2E-A9G8-01 | 10 | 50532105 | 50532105 | C | T | C10orf71 | Silent | p.D505D | 0.38 | NaN | NaN |
4 | TCGA-2E-A9G8-01 | 10 | 72639765 | 72639766 | TC | - | SGPL1 | 3'UTR | NaN | 0.27 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
798651 | TCGA-SL-A6JA-01 | X | 85117942 | 85117943 | - | A | CHM | 3'UTR | NaN | 0.44 | NaN | NaN |
798652 | TCGA-SL-A6JA-01 | X | 85118095 | 85118096 | - | CC | CHM | 3'UTR | NaN | 0.11 | NaN | NaN |
798653 | TCGA-SL-A6JA-01 | X | 86919876 | 86919876 | G | A | KLHL4 | Missense_Mutation | p.G680R | 0.40 | deleterious(0.01) | probably_damaging(1) |
798654 | TCGA-SL-A6JA-01 | X | 92966183 | 92966184 | - | T | FAM133A | 3'UTR | NaN | 0.30 | NaN | NaN |
798655 | TCGA-SL-A6JA-01 | X | 9864479 | 9864479 | G | A | SHROOM2 | Missense_Mutation | p.R844H | 0.35 | tolerated(0.1) | benign(0) |
798656 rows × 12 columns
In [40]:
# Tally the rows of the UCEC table by their "effect" value, most frequent first.
# Note: this rebinds mutation_type_counts, replacing the BRCA tally computed earlier.
mutation_type_counts = TCGA_UCEC_MC3_Public.loc[:, "effect"].value_counts()
mutation_type_counts
Out[40]:
effect Missense_Mutation 387884 Silent 144249 3'UTR 122921 Nonsense_Mutation 36485 Intron 28109 5'UTR 18233 Frame_Shift_Del 14898 RNA 14087 Splice_Site 9617 3'Flank 9260 Frame_Shift_Ins 6225 5'Flank 4523 In_Frame_Del 1320 Translation_Start_Site 369 Nonstop_Mutation 357 In_Frame_Ins 90 large deletion 29 Name: count, dtype: int64
In [41]:
# Count how many rows of the UCEC table belong to each "sample" identifier,
# largest first. Note: this rebinds sample_types, replacing the BRCA counts.
sample_types = TCGA_UCEC_MC3_Public.loc[:, "sample"].value_counts()
sample_types
Out[41]:
sample TCGA-EO-A22U-01 25534 TCGA-FI-A2D5-01 24192 TCGA-AX-A2HC-01 23087 TCGA-EO-A22R-01 22823 TCGA-B5-A3FC-01 22662 ... TCGA-AX-A2HF-01 32 TCGA-FI-A2CX-01 32 TCGA-AX-A1CC-01 30 TCGA-BG-A0YU-01 30 TCGA-QF-A5YT-01 12 Name: count, Length: 447, dtype: int64
In [42]:
# Count how many rows of the UCEC table are attributed to each "gene",
# largest first. Note: this rebinds gene_types, replacing the BRCA counts.
gene_types = TCGA_UCEC_MC3_Public.loc[:, "gene"].value_counts()
gene_types
Out[42]:
gene TTN 2409 MUC16 726 PTEN 584 DST 576 CSMD3 535 ... SEC24B-AS1 1 MIRLET7C 1 AC002519.8 1 TRAJ39 1 AP006285.6 1 Name: count, Length: 20957, dtype: int64
In [43]:
# Horizontal bar chart of the genes with the most rows in the UCEC mutation table.
#
# The counts are taken straight from TCGA_UCEC_MC3_Public rather than from the
# shared name `gene_types`, which is also assigned from the BRCA table earlier
# in this notebook; depending on which cell ran last, `gene_types` could hold
# BRCA data and this chart would be mislabeled.
# pyplot (`plt`) is imported in the notebook's first cell, so it is not re-imported here.
top_n = 25  # drives both the number of bars and the number shown in the title
top_genes = TCGA_UCEC_MC3_Public["gene"].value_counts().head(top_n)

# Use an explicit Figure/Axes pair instead of pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind="barh", color="purple", edgecolor="black", ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title(f"Top {top_n} Most Frequently Mutated Genes in TCGA UCEC (MC3)")
ax.invert_yaxis()  # put the highest-count gene at the top of the chart
fig.tight_layout()
plt.show()
In [ ]: