prepare reference file

December 6, 2022 ยท View on GitHub

    1. Genome build version. e.g. hg38
    1. cutting site of the specified restriction enzyme (Note, we use 500bp windows for micro-C "frag", and 5kb as "anchor", generated by bedtools makewindows)
    1. path to the scripts, <HiCorr_dir>/bin/generateReference_lib/
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chromFa.tar.gz
wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/hg38-human/hg38.blacklist.bed.gz
gunzip hg38.blacklist.bed.gz
tar -xvf hg38.chromFa.tar.gz
for file in `ls chroms/ | grep _`;do
        rm chroms/$file
done

genome=hg38
genome_fa_dir=./chroms/
genome_chrom_size=./hg38.chrom.sizes
# for example: DpnII-->GATC; HindIII-->AAGCTT. This would depend on which enzyme you use for Hi-C experiment. 
cutsite=GATC
enzyme=DPNII
blackregion=./hg38.blacklist.bed
lib=HiCorr/bin/generateReference_lib/
output=./${genome}_${enzyme}
mkdir $output
chmod +x $lib/*

## start build references #################################################################################
# 1. generate fragment bed file
$lib/find_RE_sites.pl $genome_fa_dir $genome_chrom_size $cutsite $lib> $genome.cutting.sites
python3 $lib/sites_to_frag.py $genome_chrom_size $genome.cutting.sites | awk '{print \$0,\$3-\$2+1}' OFS='\t' >  $genome.$enzyme.frag.bed

# 2. generate ~5kb anchor (Here we average each fragment into ~5kb anchor)
python3 $lib/generate.fragment.py $genome.$enzyme.frag.bed 5000 > ${genome}_${enzyme}_frag_2_anchor  
python3 $lib/get_aveg_frag_length.py ${genome}_${enzyme}_frag_2_anchor anchor.bed > ${genome}_${enzyme}_anchors_avg.bed 
mkdir $genome.anchor.5kb
cat ${genome}_${enzyme}_anchors_avg.bed | cut -f1-4 | awk '{print>"'$genome'"".anchor.5kb/"\$1".bed"}'

# 3. divided all anchor based on their length into 20 equal size groups 
$lib/get_group_range.pl ${genome}_${enzyme}_anchors_avg.bed 6 20 > ${genome}_anchor_length.groups

# 4. generate all possible trans contact matric 
$lib/count_trans_pairs_by_GC.pl ${genome}_${enzyme}_anchors_avg.bed ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups > $genome.trans.possible.pairs

# 5. generate anchor to blacklist file by overlaping 5kb anchor and blacklist region
bedtools intersect -wa -a ${genome}_${enzyme}_anchors_avg.bed -b $blackregion | cut -f1-4 | sort -u > ${genome}_5kb_anchors_blacklist

# 6. generate all possible pairs within 2Mb (blacklist has been removed) 
$lib/list_full_matrix.pl ${genome}_${enzyme}_anchors_avg.bed 2000000 | python3 $lib/remove.blacklist.py ${genome}_5kb_anchors_blacklist > ${genome}.full.matrix
# 7. Set up 400 groups for distance within 0-2Mb (5kb per group)
### "$genome.dist.5kb.group" contains distance groups (401 groups for 200,0000)
# 1       -1      1000
# 2       1001    5000
# 3       5001    10000
# 4       10001   15000
# ...
# 400     1990001 1995000
#401     1995001 2e+06
cp HiCorr/bin/dist.401.group ${genome}.dist.401.group
# 8. To create length and distance statistical file for full matrix 
$lib/get_group_statistics.pl ${genome}.full.matrix ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups $genome.dist.5kb.group | awk '{print \$0,0}' OFS='\t' > $genome.full.dist.stat.5kb