In [3]:
import pandas as pd
import numpy as np
In [6]:
integrins = pd.read_excel("/Users/reneewang/Downloads/gtex_integrin_7_organs.xlsx")
integrins
Out[6]:
Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GTEX-13QIC-0011-R1a-SM-5O9CJ | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
2 | GTEX-PWCY-1326-SM-48TCU | Ovary | 2.3953 | -5.0116 | 1.4547 | 4.2593 | -0.7346 | 4.4149 | 0.2642 | 1.5216 | ... | 3.6816 | 1.5465 | 7.2964 | -0.9406 | 2.7742 | 5.0414 | 2.0325 | 0.7579 | 2.2573 | 1.2516 |
3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
4 | GTEX-ZA64-1526-SM-5CVMD | Breast | 2.0569 | -2.4659 | 3.3993 | 3.1311 | 3.0074 | 4.4977 | -1.7809 | 2.7139 | ... | 4.7340 | 0.6332 | 7.3496 | -0.9406 | 2.5338 | 6.5696 | 1.7229 | -0.6416 | 3.1195 | 1.1050 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1983 | GTEX-YFCO-1626-SM-4W1Z3 | Prostate | 2.9581 | -4.6082 | 1.1641 | 4.6938 | 1.5902 | 5.8625 | -0.5125 | 1.7617 | ... | 3.8798 | -1.4699 | 7.5163 | -0.3752 | 2.9562 | 5.3035 | 4.4304 | -0.9406 | 3.6136 | 0.4233 |
1984 | GTEX-1117F-2826-SM-5GZXL | Breast | 4.3184 | -6.5064 | 1.0433 | 4.8440 | 3.5498 | 4.6809 | 1.0293 | 3.3478 | ... | 5.3256 | -0.0725 | 7.7516 | 1.1382 | 2.1411 | 7.1132 | 0.3796 | 0.0854 | 3.8650 | 1.0151 |
1985 | GTEX-Q2AG-2826-SM-2HMJQ | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1987 rows × 29 columns
In [7]:
brain_integrins = integrins[integrins['primary_site'] == 'Brain']
brain_integrins
Out[7]:
Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GTEX-13QIC-0011-R1a-SM-5O9CJ | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
8 | GTEX-N7MS-2526-SM-26GMA | Brain | 2.2960 | -9.9658 | 0.6608 | 5.2840 | 0.4233 | 4.8510 | -0.2671 | -0.1031 | ... | 1.5415 | 4.6623 | 3.4687 | 0.5666 | -0.0130 | 3.0654 | 0.7916 | 1.0433 | -0.7346 | -0.7588 |
10 | GTEX-N7MS-2526-SM-26GMR | Brain | -0.2498 | -9.9658 | -0.8863 | 3.1685 | -1.6394 | 2.8158 | -0.4719 | -1.1488 | ... | 1.6045 | 0.9268 | 2.8055 | -0.5973 | 0.4657 | 1.8918 | 0.3460 | 0.3907 | -1.9942 | -1.5522 |
12 | GTEX-NPJ7-0011-R6a-SM-2I3G7 | Brain | 1.6045 | -6.5064 | 2.3193 | 3.6335 | -2.3147 | 5.0670 | -0.8863 | -0.8084 | ... | 3.2018 | 1.7575 | 4.6894 | 0.4125 | -0.6643 | 3.6916 | -0.6193 | -2.2447 | 1.2023 | -1.9942 |
14 | GTEX-132Q8-3026-SM-5PNVG | Brain | 2.8974 | -6.5064 | 1.9601 | 4.1836 | -0.8084 | 4.5892 | -0.5543 | 0.3460 | ... | 3.6018 | 2.7931 | 4.7274 | -0.0574 | 1.2271 | 4.3793 | 0.8488 | -0.2159 | 2.1378 | -0.6416 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1977 | GTEX-13G51-0011-R6b-SM-5LZX4 | Brain | -0.3383 | -6.5064 | 1.6234 | 2.7487 | -2.2447 | 5.2415 | -0.8863 | -2.9324 | ... | 2.1988 | 0.4016 | 4.5142 | -1.1811 | -0.8084 | 3.9983 | -1.0862 | -3.1714 | -0.7588 | -1.9379 |
1978 | GTEX-YFC4-0011-R10a-SM-4SOK5 | Brain | 0.4447 | -5.5735 | 0.3231 | 3.5237 | -1.5105 | 4.9016 | 0.9419 | -2.7274 | ... | 2.8178 | 1.3567 | 4.4621 | -0.2845 | 1.0222 | 3.3336 | 0.1903 | -1.0559 | 0.0300 | -0.4719 |
1980 | GTEX-13112-0011-R4b-SM-5DUXL | Brain | 0.6969 | -6.5064 | -0.9686 | 2.3760 | -2.2447 | 4.0739 | -0.6193 | -4.0350 | ... | 2.7357 | 1.5806 | 4.6882 | -0.9971 | -0.5756 | 3.5136 | 0.9343 | -1.0862 | 0.4340 | -2.2447 |
1981 | GTEX-1313W-0011-R1b-SM-5EQ4A | Brain | 0.1124 | -5.0116 | 2.2482 | 2.8897 | -0.5125 | 4.6445 | 0.3115 | -3.6259 | ... | 2.1147 | 0.9716 | 5.1202 | 0.6608 | 0.4761 | 3.2343 | 0.8408 | -0.0574 | -0.1828 | -2.5479 |
1985 | GTEX-Q2AG-2826-SM-2HMJQ | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1152 rows × 29 columns
In [9]:
# Import the required libraries
import matplotlib.pyplot as plt
import seaborn as sns
# violin plot for all the genes of the brain
plt.figure(figsize = (16, 6))
sns.violinplot(data = brain_integrins)
plt.title("Integrin Genes of the Brain")
plt.xlabel("Integrin Genes")
plt.ylabel("Gene Expression Levels")
plt.show()
In [10]:
lung_integrins = integrins[integrins['primary_site'] == 'Lung']
lung_integrins
Out[10]:
Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
5 | GTEX-11EI6-0826-SM-5985V | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
6 | GTEX-S341-0326-SM-2XCAU | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
7 | GTEX-WY7C-0426-SM-3NB3C | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1962 | GTEX-Q2AH-0426-SM-2I3EP | Lung | 5.9644 | -1.3921 | 5.1061 | 6.9470 | 3.8973 | 4.8630 | 3.6089 | 3.9765 | ... | 5.1115 | 4.9041 | 7.9145 | 4.5559 | 3.7138 | 6.5782 | 4.7512 | 2.9710 | 5.0777 | 1.8444 |
1970 | GTEX-RWS6-0226-SM-2XCA9 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
1975 | GTEX-131XE-0726-SM-5HL9K | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
288 rows × 29 columns
In [11]:
#violin plot for all the genes of the lung
plt.figure(figsize = (16, 6))
sns.violinplot(data = lung_integrins)
plt.title("Integrin Genes of the Lung")
plt.xlabel("Integrin Genes")
plt.ylabel("Gene Expression Levels")
plt.show()
In [12]:
brain_lung_integrins = integrins[integrins['primary_site'].isin(['Brain', 'Lung'])] #filter data by organ, display both brain and lung data
brain_lung_integrins
Out[12]:
Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GTEX-13QIC-0011-R1a-SM-5O9CJ | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
5 | GTEX-11EI6-0826-SM-5985V | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
6 | GTEX-S341-0326-SM-2XCAU | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1980 | GTEX-13112-0011-R4b-SM-5DUXL | Brain | 0.6969 | -6.5064 | -0.9686 | 2.3760 | -2.2447 | 4.0739 | -0.6193 | -4.0350 | ... | 2.7357 | 1.5806 | 4.6882 | -0.9971 | -0.5756 | 3.5136 | 0.9343 | -1.0862 | 0.4340 | -2.2447 |
1981 | GTEX-1313W-0011-R1b-SM-5EQ4A | Brain | 0.1124 | -5.0116 | 2.2482 | 2.8897 | -0.5125 | 4.6445 | 0.3115 | -3.6259 | ... | 2.1147 | 0.9716 | 5.1202 | 0.6608 | 0.4761 | 3.2343 | 0.8408 | -0.0574 | -0.1828 | -2.5479 |
1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1985 | GTEX-Q2AG-2826-SM-2HMJQ | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1440 rows × 29 columns
In [23]:
# First, define the data_brain_lung variable before using it
# For example, you might need to load your data from a file:
#data_brain_lung = pd.read_csv('/Users/reneewang/Downloads/gtex_integrin_7_organs.xlsx')
brain_lung_integrins_expression_only = brain_lung_integrins.iloc[:, 1:]
brain_lung_integrins_expression_only
Out[23]:
primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | 4.9663 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
5 | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | 5.0527 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
6 | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | 5.2392 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1980 | Brain | 0.6969 | -6.5064 | -0.9686 | 2.3760 | -2.2447 | 4.0739 | -0.6193 | -4.0350 | 4.8788 | ... | 2.7357 | 1.5806 | 4.6882 | -0.9971 | -0.5756 | 3.5136 | 0.9343 | -1.0862 | 0.4340 | -2.2447 |
1981 | Brain | 0.1124 | -5.0116 | 2.2482 | 2.8897 | -0.5125 | 4.6445 | 0.3115 | -3.6259 | 4.5110 | ... | 2.1147 | 0.9716 | 5.1202 | 0.6608 | 0.4761 | 3.2343 | 0.8408 | -0.0574 | -0.1828 | -2.5479 |
1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1985 | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | 5.3597 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1440 rows × 28 columns
In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#define wwhat is X and what is Y in your model
X=brain_lung_integrins_expression_only[['ITGA10']]
y=brain_lung_integrins_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 0.94
In [14]:
brain_lung_integrins = integrins[integrins['primary_site'].isin(['Brain', 'Lung'])] #filter data by organ, display both brain and lungdata
#rearrange data
brain_lung_integrins_vertical = brain_lung_integrins_expression_only.melt(id_vars = 'primary_site', var_name = 'integrin_gene', value_name = 'expression_levels')
brain_lung_integrins_vertical
Out[14]:
primary_site | integrin_gene | expression_levels | |
---|---|---|---|
0 | Brain | ITGA10 | 0.5763 |
1 | Lung | ITGA10 | 4.9137 |
2 | Lung | ITGA10 | 4.0541 |
3 | Lung | ITGA10 | 6.0732 |
4 | Lung | ITGA10 | 4.2510 |
... | ... | ... | ... |
38875 | Brain | ITGA11 | -2.2447 |
38876 | Brain | ITGA11 | -2.5479 |
38877 | Lung | ITGA11 | 1.6604 |
38878 | Brain | ITGA11 | -0.5125 |
38879 | Lung | ITGA11 | 1.0007 |
38880 rows × 3 columns
In [27]:
#switch ITGA10 to ITGB4 and see how that impact its accuracy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#define wwhat is X and what is Y in your model
X=brain_lung_integrins_expression_only[['ITGB4']]
y=brain_lung_integrins_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.81
In [15]:
plt.figure(figsize=(16, 6))
sns.violinplot(x = 'integrin_gene', y = 'expression_levels', hue = 'primary_site', data = brain_lung_integrins_vertical, split = True, inner = 'quartile')
plt.title("Integrin Genes of the Brain vs. the Lung")
plt.xlabel("Integrin Gene")
plt.ylabel("Gene Expression Levels")
plt.legend(title = 'primary_site')
plt.show()
In [28]:
## model accuracy is defined as: number of correction predictions/total number of predictions
In [33]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = brain_lung_integrins_expression_only[['ITGB4']] # 👈 Use your chosen integrin
y = brain_lung_integrins_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGB4 Expression')
plt.legend()
plt.grid(True)
plt.show()
In [32]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = brain_lung_integrins_expression_only[['ITGA10']] # 👈 Use your chosen integrin
y = brain_lung_integrins_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGA10 Expression')
plt.legend()
plt.grid(True)
plt.show()
In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: Prepare data
X = brain_lung_integrins_expression_only[['ITGA3', 'ITGB4']] # 👈 Include both integrins
y = brain_lung_integrins_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1}) # Binary target
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGA3 & ITGB4 Expression')
plt.legend()
plt.grid(True)
plt.show()
In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Step 1: Prepare features and target
selected_genes = ['ITGA10', 'ITGB4']
#X = integrins.iloc[:, -27:] # Assuming the last 27 columns are integrins
X = integrins[selected_genes] # Assuming the last 27 columns are integrins
y = integrins['primary_site']
# Step 2: Encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# Step 4: Train multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
# Step 5: Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
#a confusion matrix is a table that summarizes the performance of a classification model. It visualizes the counts of true positives, true negatives, false positives, and false negatives, allowing for a detailed analysis of how well a model is predicting different classes. The columns typically represent predicted classes, while the rows represent actual (or true) classes. The values within the matrix represent the counts of instances that fall into each combination of predicted and actual classes.
Accuracy: 0.7939698492462312 Classification Report: precision recall f1-score support Bone Marrow 0.77 1.00 0.87 10 Brain 0.81 0.94 0.87 247 Breast 0.64 0.41 0.50 44 Liver 1.00 0.65 0.79 23 Lung 0.76 0.88 0.82 43 Ovary 0.50 0.10 0.17 10 Prostate 0.75 0.14 0.24 21 accuracy 0.79 398 macro avg 0.75 0.59 0.61 398 weighted avg 0.78 0.79 0.77 398 Confusion Matrix: [[ 10 0 0 0 0 0 0] [ 3 231 3 0 8 1 1] [ 0 25 18 0 1 0 0] [ 0 8 0 15 0 0 0] [ 0 4 1 0 38 0 0] [ 0 6 0 0 3 1 0] [ 0 12 6 0 0 0 3]]
/opt/anaconda3/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1264: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning. warnings.warn(
In [40]:
integrins['primary_site'].value_counts()
Out[40]:
primary_site Brain 1152 Lung 288 Breast 179 Liver 110 Prostate 100 Ovary 88 Bone Marrow 70 Name: count, dtype: int64
In [ ]: