# Download the human GENCODE v36 annotation (gzipped GTF).
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.annotation.gtf.gz
# Make sure the target directory exists first: if ./data is missing, a plain
# `mv <file> ./data` renames the archive to a file called "data" instead of
# moving it into a directory. The trailing slash makes mv fail loudly if
# ./data is not a directory.
mkdir -p ./data
mv gencode.v36.annotation.gtf.gz ./data/

# Install rnanorm, the package the FPKM/TPM/CPM/TMM normalizers are imported from below.
pip install rnanorm

# import packages
import pandas as pd
from rnanorm import FPKM, TPM, CPM, TMM

# Build a TPM normalizer from the GENCODE v36 annotation (GTF) file.
gtf_path = "https://www.immuno-compass.com/download/other/gencode.v36.annotation.gtf.gz"
tpm = (
    TPM(gtf_path)
    .set_output(transform="pandas")  # ask for pandas output from the transform
)

# Load the example raw-count table; the first CSV column is used as the row index.
df_counts = pd.read_csv(
    'https://www.immuno-compass.com/download/other/toy_raw_counts.csv',
    index_col=0,
)
df_counts.head()

# Normalize the example counts to TPM and keep a copy on disk.
df_tpm = tpm.fit_transform(df_counts)
df_tpm.to_csv('./toy_tpm.csv')
df_tpm.head()

# Build a one-column table holding the cancer type of every sample, indexed
# exactly like the count table. Every sample here is labelled 'SKCM'.
df_cancer_type = pd.DataFrame([], index=df_counts.index)
df_cancer_type['cancer_type'] = 'SKCM'
df_cancer_type.head()

import json  # NOTE(review): not used in the visible code; kept in case later cells need it

# Lookup keyed by cancer-type abbreviation: column 0 of the JSON read with
# orient='index' holds the value each abbreviation is replaced by.
cancer_code_map = pd.read_json('https://www.immuno-compass.com/download/other/cancer_code.json',
                               orient='index')[0]
cancer_codes = df_cancer_type['cancer_type'].map(cancer_code_map)

# Series.map yields NaN for any abbreviation missing from the lookup. Fail
# loudly here rather than silently writing NaN cancer types into the inputs.
unmapped = df_cancer_type.loc[cancer_codes.isna(), 'cancer_type'].unique()
if len(unmapped) > 0:
    raise ValueError(
        "Unknown cancer type abbreviation(s): " + ", ".join(map(str, unmapped))
    )

df_cancer_type['cancer_type'] = cancer_codes
df_cancer_type.head()

# Load the table describing the genes Compass expects as input.
gene_map = pd.read_csv(
    'https://www.immuno-compass.com/download/other/compass_gene_map.csv'
)
gene_map.head()

# Select the TPM columns listed in gene_map (in that order), then relabel them
# from versioned Ensembl IDs (ensid_v36) to gene names.
df_tpm_input = df_tpm[gene_map['ensid_v36']]
ensid_to_name = gene_map.set_index('ensid_v36')['gene_name']
df_tpm_input.columns = df_tpm_input.columns.map(ensid_to_name)
df_tpm_input.shape
# Shape recorded in the original run: (25, 15672)

df_tpm_input.head()

# Step 3: generate the inputs and save them.
# The cancer_type column comes first, followed by the gene columns; join
# aligns the two tables on their shared sample index.
df_inputs = df_cancer_type.join(df_tpm_input)
df_inputs.head()

df_inputs.to_csv('./compass_inputs.csv')

	ENSG00000223972.5	ENSG00000227232.5	ENSG00000278267.1	ENSG00000237613.2	ENSG00000238009.6	...	ENSG00000198886.2	ENSG00000210176.1	ENSG00000210184.1	ENSG00000210191.1	ENSG00000198786.2	ENSG00000198695.2	ENSG00000198727.2	ENSG00000210196.2
ERR2208944	6	201	0	1	0	...	1376	0	0	0	947	178	582	4
ERR2208928	0	222	1	0	0	...	2263	0	0	0	2549	459	1486	0
ERR2208949	1	487	0	3	0	...	2544	1	0	0	1783	377	745	4
ERR2208900	13	569	0	14	4	...	13168	4	3	1	10988	2702	3746	101
ERR2208922	0	29	1	0	0	...	14029	2	0	0	5480	1302	4160	6

	ENSG00000223972.5	ENSG00000227232.5	ENSG00000278267.1	ENSG00000237613.2	ENSG00000238009.6	...	ENSG00000198886.2	ENSG00000210176.1	ENSG00000210184.1	ENSG00000210191.1	ENSG00000198786.2	ENSG00000198695.2	ENSG00000198727.2	ENSG00000210196.2
ERR2208944	0.460737	19.821755	0.000000	0.109294	0.000000	...	133.036440	0.000000	0.000000	0.000000	69.629485	45.171249	67.957710	7.837047
ERR2208928	0.000000	25.382838	2.271609	0.000000	0.000000	...	253.675132	0.000000	0.000000	0.000000	217.297235	135.050420	201.175794	0.000000
ERR2208949	0.069637	43.552412	0.000000	0.297342	0.000000	...	223.052187	1.751014	0.000000	0.000000	118.886282	86.760220	78.887687	7.107055
ERR2208900	0.172087	9.672981	0.000000	0.263771	0.024656	...	219.469413	1.331418	1.167811	0.323478	139.272015	118.203257	75.402463	34.112682
ERR2208922	0.000000	2.360453	1.617126	0.000000	0.000000	...	1119.515705	3.187378	0.000000	0.000000	332.563864	272.712079	400.922453	9.702754

Study Abbreviation	Study Name
LAML	Acute Myeloid Leukemia
ACC	Adrenocortical carcinoma
BLCA	Bladder Urothelial Carcinoma
LGG	Brain Lower Grade Glioma
BRCA	Breast invasive carcinoma
CESC	Cervical squamous cell carcinoma and endocervical adenocarcinoma
CHOL	Cholangiocarcinoma
LCML	Chronic Myelogenous Leukemia
COAD	Colon adenocarcinoma
CNTL	Controls
ESCA	Esophageal carcinoma
FPPP	FFPE Pilot Phase II
GBM	Glioblastoma multiforme
HNSC	Head and Neck squamous cell carcinoma
KICH	Kidney Chromophobe
KIRC	Kidney renal clear cell carcinoma
KIRP	Kidney renal papillary cell carcinoma
LIHC	Liver hepatocellular carcinoma
LUAD	Lung adenocarcinoma
LUSC	Lung squamous cell carcinoma
DLBC	Lymphoid Neoplasm Diffuse Large B-cell Lymphoma
MESO	Mesothelioma
MISC	Miscellaneous
OV	Ovarian serous cystadenocarcinoma
PAAD	Pancreatic adenocarcinoma
PCPG	Pheochromocytoma and Paraganglioma
PRAD	Prostate adenocarcinoma
READ	Rectum adenocarcinoma
SARC	Sarcoma
SKCM	Skin Cutaneous Melanoma
STAD	Stomach adenocarcinoma
TGCT	Testicular Germ Cell Tumors
THYM	Thymoma
THCA	Thyroid carcinoma
UCS	Uterine Carcinosarcoma
UCEC	Uterine Corpus Endometrial Carcinoma
UVM	Uveal Melanoma

	ensid	gene_name	ensid_v36	gene_type	gene_supertype	entrezgene
0	ENSG00000121410	A1BG	ENSG00000121410.12	protein_coding	protein_coding	1.0
1	ENSG00000148584	A1CF	ENSG00000148584.15	protein_coding	protein_coding	29974.0
2	ENSG00000175899	A2M	ENSG00000175899.15	protein_coding	protein_coding	2.0
3	ENSG00000166535	A2ML1	ENSG00000166535.20	protein_coding	protein_coding	144568.0
4	ENSG00000128274	A4GALT	ENSG00000128274.17	protein_coding	protein_coding	53947.0

	A1BG	A1CF	A2M	A2ML1	A4GALT	A4GNT	AAAS	AACS	AADAC	AADAT	...	ZWILCH	ZWINT	ZXDA	ZXDB	ZXDC	ZYG11A	ZYG11B	ZYX	ZZEF1	ZZZ3
ERR2208944	0.000000	0.000000	859.203620	73.019466	11.942279	1.947147	86.527503	9.236956	3.918524	17.763974	...	16.725820	14.012390	7.255890	6.984732	16.469669	0.879873	21.106355	89.920944	47.520979	21.534480
ERR2208928	0.038627	0.032171	881.830260	7.533515	12.650118	2.778540	95.158662	10.227978	1.798357	10.818061	...	34.613126	42.500215	12.806729	12.317719	18.357604	0.526526	35.163162	60.709750	52.413439	25.859226
ERR2208949	0.030213	0.012581	504.984491	50.836895	5.900676	0.611231	106.174319	8.090318	4.960132	28.557439	...	12.677251	19.670726	12.836934	9.511444	9.528438	0.154435	23.424874	69.710920	40.638326	25.926391
ERR2208900	0.143579	0.023916	1940.416805	0.182940	4.014771	0.813332	46.225429	2.235265	0.042219	40.342963	...	24.998045	21.527292	8.889713	7.719297	12.813737	0.670318	18.452489	56.563242	36.542147	32.326600
ERR2208922	0.109992	0.286277	1534.682495	0.860539	11.887174	0.061813	83.537696	5.875654	1.482365	16.994521	...	34.388006	26.369286	3.395453	5.485167	6.931863	0.093706	19.792547	38.539763	25.926938	27.002840

Compass Input Requirement¶

Data Processing Recommendation¶

1. mRNA-Seq Alignment Workflow ¶

Specific Process¶

2. Converting mRNA Raw Counts to TPM ¶

Step 1: Download the human GENCODE annotation file (v36)¶

Step 2: Use the rnanorm tool to convert counts to TPM¶

step3: Now let's test on an example file¶

3. Preparing the inputs for the Compass¶

Step1. Add the cancer type information¶

Step2. Now let's map df_tpm to Compass's input genes.¶

Compass Input Requirement¶

Data Processing Recommendation¶

1. mRNA-Seq Alignment Workflow¶

Specific Process¶

2. Converting mRNA Raw Counts to TPM¶

Step 1: Download the human GENCODE annotation file (v36)¶

Step 2: Use the rnanorm tool to convert counts to TPM¶

step3: Now let's test on an example file¶

3. Preparing the inputs for the Compass¶

Step1. Add the cancer type information¶

Step2. Now let's map df_tpm to Compass's input genes.¶

1. mRNA-Seq Alignment Workflow ¶

2. Converting mRNA Raw Counts to TPM ¶