2022-11-29

How to print all columns and prevent an insertion of one when using lapply()?

I have a file with 28 columns and more than 10,000 rows. I am splitting this file by its second column, gene_id, so that there is one output file per distinct gene_id.

variant_id gene_id tss_distance ma_samples ma_count maf pval_nominal slope slope_se hg38_chr hg38_pos ref_allele alt_allele hg19_chr hg19_pos ID new_MAF CHROM POS REF ALT A1 OBS_CT BETA SE P SD Variance
chr1_17726150_G_A_b38 ENSG00000272426.1 821374 68 78 0.0644628 0.764314 -0.0320846 0.106958 chr1 17726150 G A chr1 18052645 rs260514:18052645:G:A 0.058155 1 18052645 G A G 1597 0.0147047 0.0656528 0.822804 2.62364886486368 6.88353336610048
chr1_17729225_G_A_b38 ENSG00000117118.9 675055 205 226 0.186777 0.770706 0.00898192 0.0308023 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890444032956592 0.792890575828
chr1_17729225_G_A_b38 ENSG00000117122.13 748390 205 226 0.186777 0.0373499 0.0553745 0.0265315 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890444032956592 0.792890575828
chr1_177298830_G_A_b38 ENSG00000117122.13 7483450 245 246 0.106777 0.0377699 0.009745 0.0265315 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890449757 0.79289055858
output 1
chr1_17726150_G_A_b38 ENSG00000272426.1 821374 68 78 0.0644628 0.764314 -0.0320846 0.106958 chr1 17726150 G A chr1 18052645 rs260514:18052645:G:A 0.058155 1 18052645 G A G 1597 0.0147047 0.0656528 0.822804 2.62364886486368 6.88353336610048

output 2
chr1_17729225_G_A_b38 ENSG00000117118.9 675055 205 226 0.186777 0.770706 0.00898192 0.0308023 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890444032956592 0.792890575828

output 3
chr1_17729225_G_A_b38 ENSG00000117122.13 748390 205 226 0.186777 0.0373499 0.0553745 0.0265315 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890444032956592 0.792890575828
chr1_17729883_G_A_b38 ENSG00000117122.13 7483450 245 246 0.106777 0.0377699 0.009745 0.0265315 chr1 17729225 G A chr1 18055720 rs11580304:18055720:G:A 0.194694 1 18055720 G A A 1597 0.00515331 0.022282 0.817129 0.890449757 0.79289055858

I am using the R script below:

# Read the combined table; header = TRUE takes the column names from line 1.
df <- read.table(
  "/data/coloc_eQTL/combined_GWAS_skin_eQTL_AL.txt",
  header = TRUE
)

# One data frame per distinct gene_id; the list names are the gene_id values.
mylist <- split(df, f = df$gene_id)

# Write each per-gene data frame to "<gene_id>.txt" in the working directory.
# paste0() is used because paste() defaults to sep = " ", which would produce
# file names with a stray space such as "ENSG00000272426.1 .txt".
# row.names = FALSE keeps the row index out of the file, so the header line
# has exactly as many fields as each data line.
# invisible() suppresses the list of NULLs that lapply() would otherwise print.
invisible(lapply(names(mylist), function(gene) {
  write.table(
    mylist[[gene]],
    file = paste0(gene, ".txt"),
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
}))

However, I notice that the output file has an extra column of numbers inserted at the start, even though I have set row.names = FALSE. Also, the output does not show the P column, so everything is misaligned. How can I ensure that all columns are retained and that no additional column is added at the start?

number  variant_id  gene_id tss_distance    ma_samples  ma_count    maf pval_nominal    slope   slope_se    hg38_chr    hg38_pos    ref_allele  alt_allele  hg19_chr    hg19_pos    ID  new_MAF CHROM   POS REF ALT A1  OBS_CT  BETA    SE  SD  Variance
6253451 chr1_17726150_G_A_b38   ENSG00000074964.16  186315  68  78  0.0644628   0.966721    0.00151619  0.0363244   chr1    17726150    G   A   chr1    18052645    rs260514:18052645:G:A   0.058155    1   18052645    G   A   G   1597    0.0147047   0.0656528   0.822804    2.62364886486368    6.88353336610048

dput() output of the original file (first 10 rows):

structure(list(variant_id = c("chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38", 
"chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38", 
"chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38", 
"chr1_17726150_G_A_b38", "chr1_17726150_G_A_b38"), gene_id = c("ENSG00000272426.1", 
"ENSG00000117118.9", "ENSG00000142623.9", "ENSG00000142619.4", 
"ENSG00000179023.8", "ENSG00000228549.3", "ENSG00000058453.16", 
"ENSG00000159339.13", "ENSG00000074964.16", "ENSG00000117122.13"
), tss_distance = c(821374L, 671980L, 521024L, 477052L, -754832L, 
855205L, 804200L, 417955L, 186315L, 745315L), ma_samples = c(68L, 
68L, 68L, 68L, 68L, 68L, 68L, 68L, 68L, 68L), ma_count = c(78L, 
78L, 78L, 78L, 78L, 78L, 78L, 78L, 78L, 78L), maf = c(0.0644628, 
0.0644628, 0.0644628, 0.0644628, 0.0644628, 0.0644628, 0.0644628, 
0.0644628, 0.0644628, 0.0644628), pval_nominal = c(0.764314, 
0.955989, 0.352575, 0.00666648, 0.667965, 0.0943182, 0.489115, 
0.796736, 0.966721, 0.326205), slope = c(-0.0320846, -0.00275742, 
-0.0687903, -0.202377, 0.0460589, -0.180725, -0.0449686, 0.0258654, 
0.00151619, -0.0424019), slope_se = c(0.106958, 0.0499406, 0.0739349, 
0.0743021, 0.107318, 0.10783, 0.0649652, 0.10037, 0.0363244, 
0.0431489), hg38_chr = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1", "chr1", "chr1", "chr1", "chr1"), hg38_pos = c(17726150L, 
17726150L, 17726150L, 17726150L, 17726150L, 17726150L, 17726150L, 
17726150L, 17726150L, 17726150L), ref_allele = c("G", "G", "G", 
"G", "G", "G", "G", "G", "G", "G"), alt_allele = c("A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A"), hg19_chr = c("chr1", 
"chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), hg19_pos = c(18052645L, 18052645L, 18052645L, 18052645L, 
18052645L, 18052645L, 18052645L, 18052645L, 18052645L, 18052645L
), ID = c("rs260514:18052645:G:A", "rs260514:18052645:G:A", "rs260514:18052645:G:A", 
"rs260514:18052645:G:A", "rs260514:18052645:G:A", "rs260514:18052645:G:A", 
"rs260514:18052645:G:A", "rs260514:18052645:G:A", "rs260514:18052645:G:A", 
"rs260514:18052645:G:A"), new_MAF = c(0.058155, 0.058155, 0.058155, 
0.058155, 0.058155, 0.058155, 0.058155, 0.058155, 0.058155, 0.058155
), CHROM = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), POS = c(18052645L, 
18052645L, 18052645L, 18052645L, 18052645L, 18052645L, 18052645L, 
18052645L, 18052645L, 18052645L), REF = c("G", "G", "G", "G", 
"G", "G", "G", "G", "G", "G"), ALT = c("A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A"), A1 = c("G", "G", "G", "G", "G", "G", 
"G", "G", "G", "G"), OBS_CT = c(1597L, 1597L, 1597L, 1597L, 1597L, 
1597L, 1597L, 1597L, 1597L, 1597L), BETA = c(0.0147047, 0.0147047, 
0.0147047, 0.0147047, 0.0147047, 0.0147047, 0.0147047, 0.0147047, 
0.0147047, 0.0147047), SE = c(0.0656528, 0.0656528, 0.0656528, 
0.0656528, 0.0656528, 0.0656528, 0.0656528, 0.0656528, 0.0656528, 
0.0656528), P = c(0.822804, 0.822804, 0.822804, 0.822804, 0.822804, 
0.822804, 0.822804, 0.822804, 0.822804, 0.822804), SD = c(2.62364886486368, 
2.62364886486368, 2.62364886486368, 2.62364886486368, 2.62364886486368, 
2.62364886486368, 2.62364886486368, 2.62364886486368, 2.62364886486368, 
2.62364886486368), Variance = c(6.88353336610048, 6.88353336610048, 
6.88353336610048, 6.88353336610048, 6.88353336610048, 6.88353336610048, 
6.88353336610048, 6.88353336610048, 6.88353336610048, 6.88353336610048
)), row.names = c(NA, 10L), class = "data.frame")

dput() output of mylist (first 2 list elements):

list(ENSG00000058453.16 = structure(list(variant_id = c("chr1_17726150_G_A_b38", 
"chr1_17728143_GC_G_b38", "chr1_17728290_G_A_b38", "chr1_17729225_G_A_b38", 
"chr1_17729967_C_T_b38", "chr1_17731217_C_T_b38"), gene_id = c("ENSG00000058453.16", 
"ENSG00000058453.16", "ENSG00000058453.16", "ENSG00000058453.16", 
"ENSG00000058453.16", "ENSG00000058453.16"), tss_distance = c(804200L, 
806193L, 806340L, 807275L, 808017L, 809267L), ma_samples = c(68L, 
395L, 167L, 205L, 233L, 233L), ma_count = c(78L, 486L, 183L, 
226L, 262L, 263L), maf = c(0.0644628, 0.401653, 0.15124, 0.186777, 
0.216529, 0.217355), pval_nominal = c(0.489115, 0.210837, 0.820243, 
0.301818, 0.137132, 0.128855), slope = c(-0.0449686, 0.0404518, 
0.0097847, 0.0413934, 0.0574705, 0.0585334), slope_se = c(0.0649652, 
0.0322899, 0.0430392, 0.0400502, 0.0386021, 0.038484), hg38_chr = c("chr1", 
"chr1", "chr1", "chr1", "chr1", "chr1"), hg38_pos = c(17726150L, 
17728143L, 17728290L, 17729225L, 17729967L, 17731217L), ref_allele = c("G", 
"GC", "G", "G", "C", "C"), alt_allele = c("A", "G", "A", "A", 
"T", "T"), hg19_chr = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), hg19_pos = c(18052645L, 18054638L, 18054785L, 18055720L, 
18056462L, 18057712L), ID = c("rs260514:18052645:G:A", "rs35592535:18054638:GC:G", 
"rs1572792:18054785:G:A", "rs11580304:18055720:G:A", "rs1890743:18056462:C:T", 
"rs7546135:18057712:C:T"), new_MAF = c(0.058155, 0.371673, 0.17466, 
0.194694, 0.197464, 0.198691), CHROM = c(1L, 1L, 1L, 1L, 1L, 
1L), POS = c(18052645L, 18054638L, 18054785L, 18055720L, 18056462L, 
18057712L), REF = c("G", "GC", "G", "G", "C", "C"), ALT = c("A", 
"G", "A", "A", "T", "T"), A1 = c("G", "G", "A", "A", "T", "T"
), OBS_CT = c(1597L, 1597L, 1597L, 1597L, 1597L, 1597L), BETA = c(0.0147047, 
0.0138673, -0.0126002, 0.00515331, 0.00415908, 0.00597402), SE = c(0.0656528, 
0.0269643, 0.0256229, 0.022282, 0.0220529, 0.0217018), P = c(0.822804, 
0.607124, 0.622959, 0.817129, 0.850434, 0.783139), SD = c(2.62364886486368, 
1.07756036432328, 1.02395469042471, 0.890444032956592, 0.88128862823752, 
0.867257800664993), Variance = c(6.88353336610048, 1.16113633876053, 
1.04848320804277, 0.792890575828, 0.77666964626077, 0.75213609281428
)), row.names = c(7L, 25L, 45L, 56L, 77L, 94L), class = "data.frame"), 
    ENSG00000074964.16 = structure(list(variant_id = c("chr1_17726150_G_A_b38", 
    "chr1_17728143_GC_G_b38", "chr1_17728290_G_A_b38", "chr1_17729225_G_A_b38", 
    "chr1_17729967_C_T_b38", "chr1_17731217_C_T_b38"), gene_id = c("ENSG00000074964.16", 
    "ENSG00000074964.16", "ENSG00000074964.16", "ENSG00000074964.16", 
    "ENSG00000074964.16", "ENSG00000074964.16"), tss_distance = c(186315L, 
    188308L, 188455L, 189390L, 190132L, 191382L), ma_samples = c(68L, 
    395L, 167L, 205L, 233L, 233L), ma_count = c(78L, 486L, 183L, 
    226L, 262L, 263L), maf = c(0.0644628, 0.401653, 0.15124, 
    0.186777, 0.216529, 0.217355), pval_nominal = c(0.966721, 
    0.954589, 0.17366, 0.865996, 0.547435, 0.565949), slope = c(0.00151619, 
    0.00102964, -0.0327149, -0.00378263, -0.01301, -0.0123769
    ), slope_se = c(0.0363244, 0.0180728, 0.0240136, 0.0224053, 
    0.0216116, 0.021548), hg38_chr = c("chr1", "chr1", "chr1", 
    "chr1", "chr1", "chr1"), hg38_pos = c(17726150L, 17728143L, 
    17728290L, 17729225L, 17729967L, 17731217L), ref_allele = c("G", 
    "GC", "G", "G", "C", "C"), alt_allele = c("A", "G", "A", 
    "A", "T", "T"), hg19_chr = c("chr1", "chr1", "chr1", "chr1", 
    "chr1", "chr1"), hg19_pos = c(18052645L, 18054638L, 18054785L, 
    18055720L, 18056462L, 18057712L), ID = c("rs260514:18052645:G:A", 
    "rs35592535:18054638:GC:G", "rs1572792:18054785:G:A", "rs11580304:18055720:G:A", 
    "rs1890743:18056462:C:T", "rs7546135:18057712:C:T"), new_MAF = c(0.058155, 
    0.371673, 0.17466, 0.194694, 0.197464, 0.198691), CHROM = c(1L, 
    1L, 1L, 1L, 1L, 1L), POS = c(18052645L, 18054638L, 18054785L, 
    18055720L, 18056462L, 18057712L), REF = c("G", "GC", "G", 
    "G", "C", "C"), ALT = c("A", "G", "A", "A", "T", "T"), A1 = c("G", 
    "G", "A", "A", "T", "T"), OBS_CT = c(1597L, 1597L, 1597L, 
    1597L, 1597L, 1597L), BETA = c(0.0147047, 0.0138673, -0.0126002, 
    0.00515331, 0.00415908, 0.00597402), SE = c(0.0656528, 0.0269643, 
    0.0256229, 0.022282, 0.0220529, 0.0217018), P = c(0.822804, 
    0.607124, 0.622959, 0.817129, 0.850434, 0.783139), SD = c(2.62364886486368, 
    1.07756036432328, 1.02395469042471, 0.890444032956592, 0.88128862823752, 
    0.867257800664993), Variance = c(6.88353336610048, 1.16113633876053, 
    1.04848320804277, 0.792890575828, 0.77666964626077, 0.75213609281428
    )), row.names = c(9L, 23L, 38L, 66L, 76L, 97L), class = "data.frame"))


No comments:

Post a Comment