use strict; # Define variables my @temp=(); my $result1; my $result2; my $result3; my $result4; my $result5; my $result6; my $resultfinal; my $count; my $coun; my $cou; my @digit=(); my $digit; my $marks; my $log; my $coll; my @scorearray=(); my $scorearray; my $percent; my $kount; my @result=(); my $result; my %final=(); my $final; my @c=(); my @matrix1; my @matrix2; my $matrix1; my $matrix2; $coll=0; my $count2; # The reverse complement DNA sequence to analyse should be saved in reverse.txt. Once saved, the program will read it into an array. open(INFILE, "; close(INFILE); chomp (@array5); # Concatenates the input sequence line by line, and then saves it in an output file. my $append = 0; if ($append) { open(MYOUTFILE, ">concatenated2.txt"); #open for write, overwrite } else { open(MYOUTFILE, ">>concatenated2.txt"); #open for write, append } print MYOUTFILE join("", @array5); #write text, no newline my $k = 0; close(MYOUTFILE); # Opens the outfile file containing the concatenated sequence. open(INFILE, "; close(INFILE); # Starting from the first nucleotide, splits the sequence into 19bp segments (length of CTCF site) in a sliding window of 1bp. my @yeslap = $digits =~ /(?=(\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w\w))/g; print @yeslap, "\n"; # Saves the splitted sequences in an output file. my $append = 0; if ($append) { open(MYOUTFILE, ">split2.txt"); #open for write, overwrite } else { open(MYOUTFILE, ">>split2.txt"); #open for write, append } print MYOUTFILE @yeslap, "\n"; close(MYOUTFILE); # Opens the split file. open(INFILE, "; close(INFILE); # Position Frequency matrix of CTCF with equally distributed pseudocount of 0.25 added to each base, A, T, G and C. @matrix1=qw/87.25 291.25 76.25 459.25 167.25 145.25 414.25 187.25 281.25 49.25 449.25 134.25 56.25 800.25 21.25 36.25 8.25 903.25 0.25 2.25 744.25 13.25 65.25 91.25 40.25 528.25 334.25 11.25 107.25 433.25 48.25 324.25 851.25 11.25 32.25 18.25 5.25 0.25 903.25 3.25 333.25 3.25 566.25 9.25 54.25 12.25 504.25 341.25 12.25 0.25 890.25 8.25 56.25 8.25 775.25 71.25 104.25 733.25 5.25 67.25 372.25 13.25 507.25 17.25 82.25 482.25 307.25 37.25 117.25 322.25 73.25 396.25 402.25 181.25 266.25 59.25/; $kount=0; $coun=0; # Define the pattern for CTCF. Because of pseudocount, a wildcard is allowed at each position. my $pattern = "[ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC][ATGC]"; # Compare the pattern with the 19 bp nucleotide segments. while($array5 =~ m/$pattern/gi) { $coun++; $count2++; my $endpos = pos $array5; # Get the starting and ending positions of the matched pattern. my $startpos=($endpos+1)-19; my $lastpos=$endpos; my $consensus = substr($array5, ($startpos-1), 19); push(@temp, $consensus, $startpos, $lastpos); $coll=0; $kount++; print($endpos, "\n"); # Split the matched pattern into 19 single bases. @digit = split(//, $consensus); # For each base, if the base is A, calculate the weight score of A according to its frequency in the CTCF Position Frequency Matrix. foreach $digit (@digit) { if($digit =~ m/A/) { $marks = @matrix1[$coll]; $result1=sqrt(914); $result2=$result1*0.3; $result3=$result2+$marks; $result4=sqrt(914); $result5=$result4+914; $result6=0.3; $resultfinal=log($result3/$result5/$result6)/log(2); # Store the Weight score in an array. push(@scorearray, $resultfinal); } # For each base, if the base is C, calculate the weight score of C according to its frequency in the CTCF Position Frequency Matrix. if($digit =~ m/C/) { $marks = @matrix1[$coll+1]; $result1=sqrt(914); $result2=$result1*0.2; $result3=$result2+$marks; $result4=sqrt(914); $result5=$result4+914; $result6=0.2; $resultfinal=log($result3/$result5/$result6)/log(2); # Store the Weight score in an array. push(@scorearray, $resultfinal); } # For each base, if the base is G, calculate the weight score of G according to its frequency in the CTCF Position Frequency Matrix. if($digit =~ m/G/) { $marks = @matrix1[$coll+2]; $result1=sqrt(914); $result2=$result1*0.2; $result3=$result2+$marks; $result4=sqrt(914); $result5=$result4+914; $result6=0.2; $resultfinal=log($result3/$result5/$result6)/log(2); # Store the Weight score in an array. push(@scorearray, $resultfinal); } # For each base, if the base is T, calculate the weight score of T according to its frequency in the CTCF Position Frequency Matrix. if($digit =~ m/T/) { $marks = @matrix1[$coll+3]; $result1=sqrt(914); $result2=$result1*0.3; $result3=$result2+$marks; $result4=sqrt(914); $result5=$result4+914; $result6=0.3; $resultfinal=log($result3/$result5/$result6)/log(2); # Store the Weight score in an array. push(@scorearray, $resultfinal); } $coll=$coll + 4; } @digit=(); my $tem=0; # Add the weight scores in a variable. foreach $scorearray(@scorearray) { $tem = $tem+$scorearray; } @scorearray = (); my $fpercent = $tem; # Save CTCF sites, start and end positions, scores, and strand locations in outfile file provided they have an overall weight score of 18 and above. if ($fpercent >= 18) { my $append = 0; if ($append) { open(MYOUTFILE, ">output-reverse.txt"); #open for write, overwrite } else { open(MYOUTFILE, ">>output-reverse.txt"); #open for write, append } print MYOUTFILE $consensus, "\t", "chr3:",182999982 - $count2, "-", 183000000 - $count2, "\t", "$fpercent", "\t", "-", "\n"; close(MYOUTFILE); } }