BCR Sequence Motif Analysis

hendra s ismanto
2 min read · Oct 24, 2020

SRA file BCR full imputed extraction

Dataset download

Shell script

Example (don’t copy this directly into a bash script):

# Fetch the SRA run, then convert the downloaded .sra file to FASTQ.
# (These are two separate commands; they must not share a line.)
prefetch SRR10007731
fastq-dump SRR10007731.sra

# Quality-filter the reads. "n/a" is passed where fastq-mcf expects an adapter
# file (presumably to skip adapter clipping; confirm against your fastq-mcf version).
fastq-mcf n/a SRR10007731.fastq -o NORMAL_82.fastq

# Align the reads against the human (hsa) V/D/J/C reference with MiXCR.
mixcr align --species hsa -p kAligner2 --report NORMAL_82_align.txt NORMAL_82.fastq NORMAL_82_align.vdjca

# The filtered FASTQ is no longer needed once the alignments are written.
rm NORMAL_82.fastq

# Assemble clonotypes, keeping the alignments (.clna) so contigs can be built next.
mixcr assemble --write-alignments --report NORMAL_82_assemble.txt NORMAL_82_align.vdjca NORMAL_82_clones.clna

# Extend each clonotype into the longest possible contig.
mixcr assembleContigs --report NORMAL_82_contigs.txt NORMAL_82_clones.clna NORMAL_82_full_clones.clns

# Export the immunoglobulin (IG) clones with the fullImputed column preset.
mixcr exportClones -c IG -p fullImputed NORMAL_82_full_clones.clns NORMAL_82_full_clones.txt

K-mer analysis

All done in Python.

Not multiplied by clonecount

import pandas as pd
import itertools
# The 20 standard amino acids (one-letter codes). The order fixes the row
# order of the result table produced by k_mer_count.
AMINO_ACIDS = 'ARNDCEQGHILKMFPSTWYV'


def k_mer_count(k_mer_num, dataname, column):
    """Count every possible amino-acid k-mer in one column of sequences.

    Args:
        k_mer_num: Length ``k`` of the k-mers to count.
        dataname: ``pandas.DataFrame`` holding the sequences.
        column: Name of the column in ``dataname`` that contains the
            amino-acid sequences.

    Returns:
        A ``pandas.DataFrame`` with columns ``Kmer`` and ``count``: one row
        for each of the ``20 ** k`` possible k-mers (in
        ``itertools.product`` order over ``AMINO_ACIDS``), including those
        that never occur (count 0).

    Notes:
        * Occurrences are counted without overlap inside a sequence
          (``"AAA"`` contains ``"AA"`` once), i.e. the same number that
          ``Series.str.count`` reports for a literal k-mer.
        * Entries that are not strings (missing values) are skipped.
        * Only k-mers made of the 20 upper-case letters are tallied; a
          window containing any other character is ignored.
    """
    k = k_mer_num
    counts = {''.join(p): 0 for p in itertools.product(AMINO_ACIDS, repeat=k)}

    # One pass over the sequences instead of one regex scan of the whole
    # column per candidate k-mer (20 ** k scans, i.e. 3.2 million for k = 5).
    for seq in dataname[column]:
        if not isinstance(seq, str):
            continue
        # Only k-mers that actually appear in this sequence can contribute.
        present = {seq[i:i + k] for i in range(len(seq) - k + 1)}
        for kmer in present:
            if kmer in counts:
                counts[kmer] += seq.count(kmer)

    return pd.DataFrame(list(counts.items()), columns=['Kmer', 'count'])


# Guarded so that importing this file does not try to read the input table.
if __name__ == '__main__':
    d1 = pd.read_csv('BRCA_18_full_clones.txt', delimiter='\t')

    print('analyzing 2-mer…')
    dl1_2 = k_mer_count(2, d1, 'aaSeqImputedCDR3')
    dl1_2.to_csv('BRCA_18_CDR3_2-mer_all.csv')
    print('finish analyzing 2_1')

    print('analyzing 3-mer…')
    dl1_3 = k_mer_count(3, d1, 'aaSeqImputedCDR3')
    dl1_3.to_csv('BRCA_18_CDR3_3-mer_all.csv')
    print('finish analyzing 3_1')

    print('analyzing 4-mer…')
    dl1_4 = k_mer_count(4, d1, 'aaSeqImputedCDR3')
    dl1_4.to_csv('BRCA_18_CDR3_4-mer_all.csv')
    print('finish analyzing 4_1')

    print('analyzing 5-mer…')
    dl1_5 = k_mer_count(5, d1, 'aaSeqImputedCDR3')
    dl1_5.to_csv('BRCA_18_CDR3_5-mer_all.csv')
    print('finish analyzing 5_1')

Multiplied by clonecount

import pandas as pd
import itertools
# The 20 standard amino acids (one-letter codes). The order fixes the row
# order of the result table produced by k_mer_count.
AMINO_ACIDS = 'ARNDCEQGHILKMFPSTWYV'


def k_mer_count(k_mer_num, dataname, column, weight_column='cloneCount'):
    """Count every possible amino-acid k-mer, weighting each row's hits.

    Each sequence's k-mer occurrences are multiplied by that row's value in
    ``weight_column`` before being summed over all rows.

    Args:
        k_mer_num: Length ``k`` of the k-mers to count.
        dataname: ``pandas.DataFrame`` holding the sequences and weights.
        column: Name of the column in ``dataname`` that contains the
            amino-acid sequences.
        weight_column: Name of the numeric column used as the per-row
            multiplier. Defaults to ``'cloneCount'``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Kmer`` and ``count``: one row
        for each of the ``20 ** k`` possible k-mers (in
        ``itertools.product`` order over ``AMINO_ACIDS``), including those
        that never occur (count 0).

    Notes:
        * Occurrences are counted without overlap inside a sequence
          (``"AAA"`` contains ``"AA"`` once), i.e. the same number that
          ``Series.str.count`` reports for a literal k-mer.
        * Rows whose sequence is not a string, or whose weight is missing,
          contribute nothing.
        * Only k-mers made of the 20 upper-case letters are tallied; a
          window containing any other character is ignored.
    """
    k = k_mer_num
    counts = {''.join(p): 0 for p in itertools.product(AMINO_ACIDS, repeat=k)}

    # One pass over the rows instead of one regex scan of the whole column
    # per candidate k-mer (20 ** k scans, i.e. 3.2 million for k = 5).
    for seq, weight in zip(dataname[column], dataname[weight_column]):
        if not isinstance(seq, str) or pd.isna(weight):
            continue
        # Only k-mers that actually appear in this sequence can contribute.
        present = {seq[i:i + k] for i in range(len(seq) - k + 1)}
        for kmer in present:
            if kmer in counts:
                counts[kmer] += seq.count(kmer) * weight

    return pd.DataFrame(list(counts.items()), columns=['Kmer', 'count'])


# Guarded so that importing this file does not try to read the input table.
if __name__ == '__main__':
    d1 = pd.read_csv('BRCA_18_full_clones.txt', delimiter='\t')

    print('analyzing 2-mer…')
    dl1_2 = k_mer_count(2, d1, 'aaSeqImputedCDR3')
    dl1_2.to_csv('BRCA_18_CDR3_2-mer_all.csv')
    print('finish analyzing 2_1')

    print('analyzing 3-mer…')
    dl1_3 = k_mer_count(3, d1, 'aaSeqImputedCDR3')
    dl1_3.to_csv('BRCA_18_CDR3_3-mer_all.csv')
    print('finish analyzing 3_1')

    print('analyzing 4-mer…')
    dl1_4 = k_mer_count(4, d1, 'aaSeqImputedCDR3')
    dl1_4.to_csv('BRCA_18_CDR3_4-mer_all.csv')
    print('finish analyzing 4_1')

    print('analyzing 5-mer…')
    dl1_5 = k_mer_count(5, d1, 'aaSeqImputedCDR3')
    dl1_5.to_csv('BRCA_18_CDR3_5-mer_all.csv')
    print('finish analyzing 5_1')

--

--