prepare reference file

December 6, 2022 · View on GitHub

1. Genome build version. e.g. hg38
- download chrom.sizes:
- donwload fa directory:
- download blacklist region: https://github.com/Boyle-Lab/
1. cutting site of the specified restriction enzyme (Note, we use 500bp windows for micro-C "frag", and 5kb as "anchor", generated by bedtools makewindows)
1. path to the scripts, <HiCorr_dir>/bin/generateReference_lib/

wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chromFa.tar.gz
wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/hg38-human/hg38.blacklist.bed.gz
gunzip hg38.blacklist.bed.gz
tar -xvf hg38.chromFa.tar.gz
for file in `ls chroms/ | grep _`;do
        rm chroms/$file
done

genome=hg38
genome_fa_dir=./chroms/
genome_chrom_size=./hg38.chrom.sizes
# for example: DpnII-->GATC; HindIII-->AAGCTT. This would depend on which enzyme you use for Hi-C experiment. 
cutsite=GATC
enzyme=DPNII
blackregion=./hg38.blacklist.bed
lib=HiCorr/bin/generateReference_lib/
output=./${genome}_${enzyme}
mkdir $output
chmod +x $lib/*

## start build references #################################################################################
# 1. generate fragment bed file
$lib/find_RE_sites.pl $genome_fa_dir $genome_chrom_size $cutsite $lib> $genome.cutting.sites
python3 $lib/sites_to_frag.py $genome_chrom_size $genome.cutting.sites | awk '{print \$0,\$3-\$2+1}' OFS='\t' >  $genome.$enzyme.frag.bed

# 2. generate ~5kb anchor (Here we average each fragment into ~5kb anchor)
python3 $lib/generate.fragment.py $genome.$enzyme.frag.bed 5000 > ${genome}_${enzyme}_frag_2_anchor  
python3 $lib/get_aveg_frag_length.py ${genome}_${enzyme}_frag_2_anchor anchor.bed > ${genome}_${enzyme}_anchors_avg.bed 
mkdir $genome.anchor.5kb
cat ${genome}_${enzyme}_anchors_avg.bed | cut -f1-4 | awk '{print>"'$genome'"".anchor.5kb/"\$1".bed"}'

# 3. divided all anchor based on their length into 20 equal size groups 
$lib/get_group_range.pl ${genome}_${enzyme}_anchors_avg.bed 6 20 > ${genome}_anchor_length.groups

# 4. generate all possible trans contact matric 
$lib/count_trans_pairs_by_GC.pl ${genome}_${enzyme}_anchors_avg.bed ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups > $genome.trans.possible.pairs

# 5. generate anchor to blacklist file by overlaping 5kb anchor and blacklist region
bedtools intersect -wa -a ${genome}_${enzyme}_anchors_avg.bed -b $blackregion | cut -f1-4 | sort -u > ${genome}_5kb_anchors_blacklist

# 6. generate all possible pairs within 2Mb (blacklist has been removed) 
$lib/list_full_matrix.pl ${genome}_${enzyme}_anchors_avg.bed 2000000 | python3 $lib/remove.blacklist.py ${genome}_5kb_anchors_blacklist > ${genome}.full.matrix
# 7. Set up 400 groups for distance within 0-2Mb (5kb per group)
### "$genome.dist.5kb.group" contains distance groups (401 groups for 200,0000)
# 1       -1      1000
# 2       1001    5000
# 3       5001    10000
# 4       10001   15000
# ...
# 400     1990001 1995000
#401     1995001 2e+06
cp HiCorr/bin/dist.401.group ${genome}.dist.401.group
# 8. To create length and distance statistical file for full matrix 
$lib/get_group_statistics.pl ${genome}.full.matrix ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups $genome.dist.5kb.group | awk '{print \$0,0}' OFS='\t' > $genome.full.dist.stat.5kb