In [3]:
import pandas as pd
import matplotlib.pyplot as plt
In [17]:
# Load the TCGA BRCA MC3 public mutation table (tab-separated text).
# The `chr` column holds both numeric labels and "X", so it is pinned to str:
# left to inference, pandas' chunked parsing may produce a mixed int/str column
# (and a DtypeWarning), which makes later comparisons on `chr` unreliable.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that directory exists; consider a configurable data directory.
TCGA_BRCA_MC3_Public = pd.read_csv(
    "/Users/reneewang/Documents/classAI/mc3_BRCA_mc3.txt",
    sep="\t",
    dtype={"chr": str},
)
In [18]:
# Show the frame that was just loaded. The loading cell binds it to
# `TCGA_BRCA_MC3_Public`; the shorter name `TCGA_BRCA_MC3` is never assigned in
# this notebook, so referencing it raises NameError on a fresh kernel.
TCGA_BRCA_MC3_Public
Out[18]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [19]:
import os

# Report the directory the kernel is currently running in.
kernel_cwd = os.getcwd()
print(kernel_cwd)
/Users/reneewang/anaconda_projects/b8b15617-c3b6-46e2-ba19-2b28d4764e55
In [20]:
# Bare expression: renders the loaded BRCA mutation table as this cell's output
# so its columns and a few rows can be checked by eye.
TCGA_BRCA_MC3_Public
Out[20]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [23]:
# Tally BRCA variant rows per `effect` label; value_counts() returns the labels
# ordered from most to least frequent.
mutation_type_counts = TCGA_BRCA_MC3_Public.loc[:, "effect"].value_counts()
mutation_type_counts
Out[23]:
effect
Missense_Mutation         45634
Silent                    17122
Frame_Shift_Del            8522
3'UTR                      6695
Nonsense_Mutation          3666
Intron                     3212
5'UTR                      2492
Splice_Site                1399
RNA                        1160
Frame_Shift_Ins             610
3'Flank                     530
5'Flank                     443
In_Frame_Del                441
Translation_Start_Site       74
Nonstop_Mutation             66
In_Frame_Ins                 34
large deletion               19
Name: count, dtype: int64
In [24]:
# Count how many variant rows each BRCA sample barcode contributes
# (one entry per distinct value of the `sample` column, largest first).
sample_types = TCGA_BRCA_MC3_Public.loc[:, "sample"].value_counts()
sample_types
Out[24]:
sample
TCGA-AC-A23H-01    6405
TCGA-EW-A2FV-01    4231
TCGA-D8-A27V-01    3332
TCGA-5L-AAT1-01    1995
TCGA-BH-A18G-01    1899
                   ... 
TCGA-AO-A03U-01       7
TCGA-A2-A25F-01       6
TCGA-LL-A440-01       6
TCGA-EW-A1P1-01       3
TCGA-AC-A2FK-01       3
Name: count, Length: 791, dtype: int64
In [31]:
# Count BRCA variant rows per gene symbol, most frequently hit gene first.
gene_types = TCGA_BRCA_MC3_Public.loc[:, "gene"].value_counts()
gene_types
Out[31]:
gene
PIK3CA     315
TTN        285
TP53       273
MUC16      141
CDH1       108
          ... 
ZNF587B      1
BLZF1        1
SLPI         1
PLTP         1
FOXQ1        1
Name: count, Length: 18065, dtype: int64
In [32]:
import matplotlib.pyplot as plt
# Rank genes straight from the BRCA frame instead of the shared `gene_types`
# variable: that name is rebound to the UCEC counts further down, so re-running
# this cell afterwards would otherwise draw UCEC data under a BRCA title.
top_genes = TCGA_BRCA_MC3_Public["gene"].value_counts().head(25)

# Explicit figure/axes so every call below targets this plot, not whatever the
# pyplot "current axes" happens to be.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind="barh", color="pink", edgecolor="black", ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title("Top 25 Most Frequently Mutated Genes in TCGA BRCA (MC3)")
ax.invert_yaxis()  # put the most frequently mutated gene at the top
fig.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
import pandas as pd
import matplotlib.pyplot as plt
In [36]:
# Load the TCGA UCEC MC3 public mutation table (tab-separated text).
# The `chr` column holds both numeric labels and "X", so it is pinned to str:
# left to inference, pandas' chunked parsing may produce a mixed int/str column
# (and a DtypeWarning), which makes later comparisons on `chr` unreliable.
# NOTE(review): absolute, machine-specific path. The BRCA file is read from
# .../Documents/classAI while this one is read from .../Desktop/classAI —
# confirm both locations are intended, and consider one configurable data directory.
TCGA_UCEC_MC3_Public = pd.read_csv(
    "/Users/reneewang/Desktop/classAI/mc3_UCEC_mc3.txt",
    sep="\t",
    dtype={"chr": str},
)
In [39]:
# Bare expression: renders the loaded UCEC mutation table as this cell's output
# so its columns and a few rows can be checked by eye.
TCGA_UCEC_MC3_Public
Out[39]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-2E-A9G8-01 10 120354589 120354589 C G PRLHR Silent p.V56V 0.22 NaN NaN
1 TCGA-2E-A9G8-01 10 34626217 34626218 - T PARD3 Frame_Shift_Ins p.S852Kfs*8 0.30 NaN NaN
2 TCGA-2E-A9G8-01 10 49939399 49939399 G A WDFY4 Silent p.E458E 0.30 NaN NaN
3 TCGA-2E-A9G8-01 10 50532105 50532105 C T C10orf71 Silent p.D505D 0.38 NaN NaN
4 TCGA-2E-A9G8-01 10 72639765 72639766 TC - SGPL1 3'UTR NaN 0.27 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
798651 TCGA-SL-A6JA-01 X 85117942 85117943 - A CHM 3'UTR NaN 0.44 NaN NaN
798652 TCGA-SL-A6JA-01 X 85118095 85118096 - CC CHM 3'UTR NaN 0.11 NaN NaN
798653 TCGA-SL-A6JA-01 X 86919876 86919876 G A KLHL4 Missense_Mutation p.G680R 0.40 deleterious(0.01) probably_damaging(1)
798654 TCGA-SL-A6JA-01 X 92966183 92966184 - T FAM133A 3'UTR NaN 0.30 NaN NaN
798655 TCGA-SL-A6JA-01 X 9864479 9864479 G A SHROOM2 Missense_Mutation p.R844H 0.35 tolerated(0.1) benign(0)

798656 rows × 12 columns

In [40]:
# Tally UCEC variant rows per `effect` label, most frequent label first.
# NOTE: this rebinds `mutation_type_counts`, which held the BRCA tallies until now.
mutation_type_counts = TCGA_UCEC_MC3_Public.loc[:, "effect"].value_counts()
mutation_type_counts
Out[40]:
effect
Missense_Mutation         387884
Silent                    144249
3'UTR                     122921
Nonsense_Mutation          36485
Intron                     28109
5'UTR                      18233
Frame_Shift_Del            14898
RNA                        14087
Splice_Site                 9617
3'Flank                     9260
Frame_Shift_Ins             6225
5'Flank                     4523
In_Frame_Del                1320
Translation_Start_Site       369
Nonstop_Mutation             357
In_Frame_Ins                  90
large deletion                29
Name: count, dtype: int64
In [41]:
# Count how many variant rows each UCEC sample barcode contributes, largest first.
# NOTE: this rebinds `sample_types`, which held the BRCA per-sample counts until now.
sample_types = TCGA_UCEC_MC3_Public.loc[:, "sample"].value_counts()
sample_types
Out[41]:
sample
TCGA-EO-A22U-01    25534
TCGA-FI-A2D5-01    24192
TCGA-AX-A2HC-01    23087
TCGA-EO-A22R-01    22823
TCGA-B5-A3FC-01    22662
                   ...  
TCGA-AX-A2HF-01       32
TCGA-FI-A2CX-01       32
TCGA-AX-A1CC-01       30
TCGA-BG-A0YU-01       30
TCGA-QF-A5YT-01       12
Name: count, Length: 447, dtype: int64
In [42]:
# Count UCEC variant rows per gene symbol, most frequently hit gene first.
# NOTE: this rebinds `gene_types`, which held the BRCA per-gene counts until now.
gene_types = TCGA_UCEC_MC3_Public.loc[:, "gene"].value_counts()
gene_types
Out[42]:
gene
TTN           2409
MUC16          726
PTEN           584
DST            576
CSMD3          535
              ... 
SEC24B-AS1       1
MIRLET7C         1
AC002519.8       1
TRAJ39           1
AP006285.6       1
Name: count, Length: 20957, dtype: int64
In [43]:
import matplotlib.pyplot as plt
# Rank genes straight from the UCEC frame instead of the shared `gene_types`
# variable: that name also holds the BRCA counts earlier in the notebook, so
# depending on cell execution order it could feed BRCA data into this UCEC plot.
top_genes = TCGA_UCEC_MC3_Public["gene"].value_counts().head(25)

# Explicit figure/axes so every call below targets this plot, not whatever the
# pyplot "current axes" happens to be.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind="barh", color="purple", edgecolor="black", ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title("Top 25 Most Frequently Mutated Genes in TCGA UCEC (MC3)")
ax.invert_yaxis()  # put the most frequently mutated gene at the top
fig.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: