This step is based on the RNA object generated from the filtered expression matrix using Seurat R package, along with a 6-bp barcode list created during library preparation to match RNA and DNA.
# R
library(Seurat)
data<- Read10X(data.dir = "1_RNA_preprocess/GeneFull/filtered")
rna<-CreateSeuratObject(counts = data, project = "example", min.cells = 3, min.features = 200)# R
df <- data.frame(RNAlibrary = rna$orig.ident)
df$DNAbarcode1<-substr(rownames(df),1,6)
df$DNAbarcode2<-substr(rownames(df),7,12)
df$DNAbarcode3<-substr(rownames(df),13,18)# R
barcode_map <- c( #predefined 6-bp RNA–DNA barcode pairing scheme used during library preparation
"TCATCC" = "TACCCG", #left represents RNA and right represents DNA
"AGTCAA" = "GAGTTT",
...
"ATGTCA" = "CGATTT",
"CCCTAC" = "CTGGAG"
)
df$DNAbarcode1 <- sapply(df$DNAbarcode1, function(barcode) {
for (old_prefix in names(barcode_map)) {
if (startsWith(barcode, old_prefix)) {
return(sub(paste0("^", old_prefix), barcode_map[old_prefix], barcode))
}
}
return(barcode)
})# R
reverse_complement <- function(seq) {
complement <- setNames(c("T", "G", "C", "A"), c("A", "C", "G", "T"))
comp_seq <- unname(complement[unlist(strsplit(seq, ""))])
rev_seq <- paste(rev(comp_seq), collapse = "")
return(rev_seq)
}
df$DNAbarcode3<-unname(sapply(df$DNAbarcode3, reverse_complement))
df$DNAbarcode<-paste0(df$DNAbarcode1,df$DNAbarcode2,df$DNAbarcode3) # combine full DNA barcode
df$DNAbarcode1<-NULL
df$DNAbarcode2<-NULL
df$DNAbarcode3<-NULL1.5 filter cells based on the high-quliaty barcode list generate from 3_ATAC_fragment and 4_chromatin_contact
# R
atac_cut_rank<-read.table("3_ATAC_fragment/02_fragment/*.barcode.cut_rank")$V1
pairs_cut_rank<-read.table("4_chromatin_contact/03_dedup/*.barcode.cut_rank")$V1
valid_DNA_barcodes <- intersect(atac_cut_rank, pairs_cut_rank)
df2 <- df[df$DNAbarcode %in% valid_DNA_barcodes, ]
# Note: If you also want to filter cells based on TSS enrichment, please run the ArchR functions (`createArrowFiles` and `ArchRProject` steps in the "3. Single-cell processing") first and export the processed barcode list for additional filtering here.# R
rna_filter <- subset(rna, cells = rownames(df2))
rna_filter <- AddMetaData(rna_filter, metadata = setNames(df2$DNAbarcode, rownames(df2)), col.name = "DNAbarcode")write.table([email protected],"example_metadata.txt",quote=F,sep='\t',col.names=T,row.names=T) outputs the cell type and DNA barcode information after cell clustering and annotation with Seurat. The metadata table will be used for downstream pseudo-bulk and single-cell analysis of the DNA library.
Before running the code below, make sure to extract DNA barcodes for each cell type from "example_metadata.txt" and save each as a single-column file (*.DNAbarcode.txt).
# Bash
for i in {celltype1,celltype2,celltype3,...,celltypeN}
do
python3 extract_ATAC_fragment.py -l 3_ATAC_fragment/03_filtered/*.filtered.tsv -s ${i}.DNAbarcode.txt -o ${i}.ATAC.fragment.tsv
python3 extract_pairs.py -l 4_chromatin_contact/05_filtered/*.dedup.filtered.pairs -s ${i}.DNAbarcode.txt -o ${i}.contact.pairs
donemacs2: install
# Bash
genome_size=mm # or hs
for i in {celltype1,celltype2,celltype3,...,celltypeN}
do
macs2 callpeak --shift -75 --extsize 150 --nomodel -B --SPMR --keep-dup all --call-summits --qval 0.01 --gsize $genome_size --format BED --name ${i} --treatment ${i}.ATAC.fragment.tsv
doneslopBed: install
bedClip and bedGraphToBigWig: install
# Bash
chrom_size=The/PATH/of/chrom/sizes/file
for i in {celltype1,celltype2,celltype3,...,celltypeN}
do
sval=$(wc -l ${i}.ATAC.fragment.tsv | awk '{printf "%f", $1/1000000}') #counts the number of tags per million in the (compressed) BED file
macs2 bdgcmp -t ${i}_treat_pileup.bdg -c ${i}_control_lambda.bdg --o-prefix ${i} -m ppois -S $sval
slopBed -i ${i}_ppois.bdg -g $chrom_size -b 0 | bedClip stdin $chrom_size ${i}.pval.signal.bedgraph
grep -E 'chrX|chrY' ${i}.pval.signal.bedgraph > temp
grep -E -v 'chrX|chrY' ${i}.pval.signal.bedgraph|sort -k1.4n -k 2,2n|cat - temp > ${i}.pval.signal.srt.bedgraph
bedGraphToBigWig ${i}.pval.signal.srt.bedgraph $chrom_size ${i}_sig.pval.signal.bigwig
rm -f ${i}.pval.signal.bedgraph ${i}.pval.signal.srt.bedgraph
rm temp
donecooler: install
# Bash
chrom_size=The/PATH/of/chrom/sizes/file
for j in {celltype1,celltype2,celltype3,...,celltypeN}
do
cat 4_chromatin_contact/05_filtered/*.dedup.pairs.head ${j}.contact.pairs|bgzip -c > ${j}.contact.pairs.gz
pairtools sort -o ${j}.sort.pairs.gz ${j}.contact.pairs.gz
pairix -f ${j}.sort.pairs.gz
cooler cload pairix --max-split 2 --nproc 12 ${chrom_size}:5000 ${j}.sort.pairs.gz ${j}.5000.cool
cooler zoomify --balance -r 5000N -n 12 -o ${j}.5000.mcool ${j}.5000.coolThe *.cool file can be used to call A/B compartment, TAD, and chromatin loops for each cell types. The *.mcool file can be used to visualization with Higlass.
The 03_filtered/*.filtered.tsv.gz files can be used for TSS enrichment score and gene score calculation, as well as cell clustering, using the ArchR R package.
# R
library(ArchR)
addArchRThreads(threads = 1)
addArchRGenome("mm10")
createArrowFiles(inputFiles = "03_filtered/*.filtered.tsv.gz", sampleNames = "example", minTSS = 1, minFrags = 1000, addTileMat = TRUE, addGeneScoreMat = TRUE) # Set a small cutoff for minTSS and minFrags if you just want to retain the same cells as in [email protected].
dna <- ArchRProject(ArrowFiles = "example.arrow",outputDirectory = "obj.name", copyArrows = TRUE)
rna_metadata<-read.table("example_metadata.txt",header=T,sep='\t',row.names=1)
dna_filter<-dna[paste("obj.name#",rna_metadata$DNAbarcode,sep=""), ] #filter cells based on the cell barcodes of [email protected]
GSmatrix <- getMatrixFromProject(dna_filter,"GeneScoreMatrix")
rownames(GSmatrix)<-rowData(GSmatrix)$name
genescore<-assays(GSmatrix)$GeneScoreMatrix
colnames(genescore)<-gsub("#","_",colnames(genescore))
saveRDS(genescore,"genescore_matrix.rds")
dna_filter <- addIterativeLSI(ArchRProj = dna_filter,useMatrix = "TileMatrix",name = "IterativeLSI",iterations = 4,clusterParams = list(resolution = c(0.2,0.6,1.2),sampleCells = 20000, n.start = 10), varFeatures = 100000,dimsToUse = 1:50,outlierQuantiles = NULL)
dna_filter <- addClusters(input = dna_filter,reducedDims = "IterativeLSI",method = "Seurat",name = "Clusters",resolution = 0.8)
dna_filter <- addUMAP(ArchRProj = dna_filter, reducedDims = "IterativeLSI", name = "UMAP", minDist = 0.2)
plotEmbedding(ArchRProj = dna_filter, colorBy = "cellColData", name = "Clusters", embedding = "UMAP")
saveRDS(dna_filter,"dna_filter.rds")Generate a list of contact pair files, each representing a single cell in tab-delimited format:
# Bash
python3 extract_pairs.py -l 4_chromatin_contact/05_filtered/*.dedup.filtered.pairs -s all_valid_DNA_barcodes.txt -o all_valid_DNA_contact.pairs
cut -d':' -f1,3 all_valid_DNA_contact.pairs | perl -p -e "s/://g" | cut -f1-5 > all_valid_DNA_contact.pairs.5columns
python3 split_per_cell_pairs.py --input_file all_valid_DNA_contact.pairs.5columns --output_dir per_cell/The per_cell folder can be used for cell embedding and imputing single-cell chromatin contact maps with Fast-Higashi and Higashi.
# Python
### Process the input data:
from higashi.Higashi_wrapper import *
config = ./higashi.JSON"
higashi_model = Higashi(config)
higashi_model.process_data()
### Initialize Fast-Higashi model and turn sparse matrices into sparse tensors:
from fasthigashi.FastHigashi_Wrapper import *
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
config = "./Fasthigashi.JSON"
fastHigashi_model = FastHigashi(config_path=config,
path2input_cache="./Fasthigashi",
path2result_dir="./Fasthigashi",
off_diag=100,
filter=False,
do_conv=False,
do_rwr=False,
do_col=False,
no_col=False)
fastHigashi_model.prep_dataset(batch_norm=True)
### Run the model:
fastHigashi_model.run_model(dim1=.6,
rank=256,
n_iter_parafac=1,
extra="")
### Visualize the embedding results:
clusters = fastHigashi_model.label_info['category']
pal1 = {"GLUT":"#ff595e", "GABA":"#1982c4", "NON":"#8ac926"}
embed = fastHigashi_model.fetch_cell_embedding(final_dim=256,restore_order=False)
embedding = embed['embed_l2_norm_correct_coverage_fh']
vec = UMAP(n_components=2, n_neighbors=25, random_state=0).fit_transform(embedding)
fig = plt.figure(figsize=(5, 3))
ax = plt.subplot(1, 1, 1)
sns.scatterplot(x=vec[:, 0], y=vec[:, 1], hue=clusters, ax=ax, s=5, alpha=0.8, linewidth=0, palette=pal1)
handles, labels = ax.get_legend_handles_labels()
labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
ax.legend(handles=handles, labels=labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncol=1)
plt.tight_layout()
plt.savefig("example_fastHigashi_UMAP.pdf", format="pdf")
### train higashi for contact map imputation:
from higashi.Higashi_wrapper import *
config = "./higashi.JSON"
higashi_model = Higashi(config)
higashi_model.prep_model()
higashi_model.train_for_imputation_nbr_0()
higashi_model.impute_no_nbr()
higashi_model.train_for_imputation_with_nbr()
higashi_model.impute_with_nbr()
with open('higashi_model.impute.pkl', 'wb') as f:
pickle.dump(higashi_model, f)