#!/usr/bin/perl # repeatalyzer1.pl # usage: repeatalyzer.pl trf_output_file.txt # Perseus Missirlis # add -repMatch=10000 to gfServer use strict; use Descriptive; use Bio::EnsEMBL::DBSQL::DBAdaptor; use DBI; #################### # Global Variables # #################### my $host = 'db02.bcgsc.bc.ca'; my $user = 'ensembl'; my $port = 3306; my $dbname = 'homo_sapiens_core_19_34b'; my $litedb = 'homo_sapiens_lite_19_34b'; my $api_password = 'ensembl'; # DBI my ($dsn) = "DBI:mysql:microlog:athena.bcgsc.ca"; my ($user_name) = "microlog_rw"; my ($password) = "repeat"; my ($dbh); my (@ary); # initialize descriptive stats object my $stat; ####################### # Connect to Database # ####################### $dbh = DBI->connect ($dsn, $user_name, $password, { RaiseError => 1 }); ###################################### # Initialize SQL statement variables # ###################################### my $sth = $dbh->prepare ("INSERT INTO repeats VALUES(?,?,?,?,?,?,?,?,?,?)"); my $sth2 = $dbh->prepare ("INSERT INTO gc VALUES(?,?,?,?)"); my $select_ensname = $dbh->prepare ("SELECT e.ens_id, e.ens_name FROM ens_db e WHERE e.ens_name = ?"); my $insert_ensdb = $dbh->prepare ("INSERT INTO ens_db VALUES(?,?,?,?,?,?,?,?)"); my $insert_go = $dbh->prepare ("INSERT INTO go VALUES(?,?,?)"); my $insert_pdb = $dbh->prepare ("INSERT INTO pdb VALUES(?,?,?)"); my $insert_mim = $dbh->prepare ("INSERT INTO mim VALUES(?,?,?)"); my $select_genenote = $dbh->prepare ("SELECT g.g_id FROM genenote g WHERE g.id_ref = ?"); my $update_hugo = $dbh->prepare ("UPDATE ens_db SET name = ? WHERE ens_name = ?"); my $insert_affy = $dbh->prepare ("INSERT INTO affy VALUES(?,?,?)"); my $insert_ugcount = $dbh->prepare ("INSERT INTO ugcount VALUES(?,?,?,?,?)"); my $insert_ugstats = $dbh->prepare ("INSERT INTO ugstats VALUES(?,?,?,?,?,?,?)"); my $select_unigene = $dbh->prepare ("SELECT chr, start, end FROM unigene WHERE cluster = ?"); my $insert_transcripts; ######################################################### # Create Hash to classify repeats into repeat families # ######################################################### # Open file containing all distinct repeat_classes # see repeatalyzer.pl Methods for protocol # ouput from repeat_classer2.pl open(REP, "/home/perseusm/progs/repeat_classer/repeat_classes.txt") || die "Cannot open repeat_classes.txt"; my %classes; while() { if (/^(\d+)\s+(\S+)/) { my $rep_class = $1; my $rep_seq = $2; my @reps = split(/o/,$rep_seq); shift(@reps); foreach my $rep (@reps) { $classes{$rep} = $rep_class; } } } close(REP); ##################################################################### # Declare an anonymous hash of hashes to store UniGene cluster info # ##################################################################### my $clusters = {}; ################################################################################################### # Set-up BLAT server variables to query UniGene unique sequence clusters for repeat polymorphisms # ################################################################################################### my $hostname = `hostname`; $hostname =~ s/\s+//g; my @hostname = split(/\./,$hostname); $hostname = $hostname[0]; my $number = sprintf "%04d", rand 10_000; $number =~ s/0/1/g; my $blat_port = $number; my $blat_db = "/home/perseusm/unigene/Hs.seq.uniq2.2bit"; print "########################\n"; print "# Creating BLAT server #\n"; print "########################\n"; print " BLAT port: $blat_port BLAT host: $hostname "; # repMatch indicates the number of occurrences of a tile (nmer) that trigger repeat masking the tile # set it so that only huge stretches of repeats are masked my $status = `/home/perseusm/blat/gfServer -repMatch=10000 status $hostname $blat_port`; unless ($status) { # Issue system command to create BLAT server system("/home/perseusm/blat/set-up_blatServer.pl $hostname $blat_port $blat_db &"); } # Ensure BLAT server is up before proceeding while(1) { print "Checking server status...\n"; my $status = `/home/perseusm/blat/gfServer status $hostname $blat_port`; print "BLAT server status:$status\n"; last if ($status); sleep 10; } ####################################### # Ensembl API Variable Initialization # ####################################### # CONNECT TO ENSEMBL DATABASE my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor ( -host => $host, -user => $user, -port => $port, -dbname => $dbname, -pass => $api_password, -litedbname => $litedb); my $slice_adaptor = $db->get_SliceAdaptor; ################################ # Insert into build_info table # ################################ my $date = `date`; #my $build_info = $dbh->prepare ("INSERT INTO build_info VALUES(?,?,?)"); #$build_info->execute ($host,$dbname,$date); ###################### # Read TRF flat file # ###################### my $chrom; while (<>) { chomp; if (/chr(\S+)/) { $chrom = $1; } elsif (/^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.\d+\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\.\d+\s+(\S+)\s+(\S+)/) { print "stdin: $&\n\n"; my $date = `date`; print "script starts at $date\n\n"; # pull out coords of interest my $chromStart= $1; my $chromEnd = $2; my $rptPeriod = $3; my $rptSize = $4; my $rptConsensus = $5; my $rptUnit = $6; my $rpt = $7; my $rptLength = length $rpt; my $rptPureLength = $rptPeriod*$rptSize; print "chrom $chrom chromStart $chromStart chromEnd $chromEnd rptPeriod $rptPeriod repConsensus $rptConsensus rptSize $rptSize rptUnit $rptUnit rpt $rpt rptLength $rptLength rptPureLength = $rptPureLength\n\n"; if ($rptSize > 1) { # initialize repeat co-ordinates relative to ensembl seq slice my $repStart = 0; my $repEnd = $rptLength; my $slice = $slice_adaptor->fetch_by_chr_start_end($chrom,$chromStart,$chromEnd); my $rptRegion = $slice->seq; my $rep_id; #################################################################################### # Check to see if repeats detected in UCSC chromosomes have same coords in ensembl # #################################################################################### my $date = `date`; print "gc calc start at $date\n\n"; if ($rpt eq $rptRegion) { } else { print "ERROR: $rep_id: repeats don't match\n"; print "chrom $chrom chromStart $chromStart chromEnd $chromEnd rptPeriod $rptPeriod repConsensus $rptConsensus rptSize $rptSize rptUnit $rptUnit rpt $rpt rptLength $rptLength\n\n"; } ########################## # Determine repeat class # ########################## my $rc_id = $classes{$rptUnit}; ######################################## # Insert repeat info into repeat table # ######################################## if (! defined $sth) { $sth = $dbh->prepare ("INSERT INTO repeats VALUES(?,?,?,?,?,?,?,?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $sth->execute ("NULL",$chrom,$chromStart,$chromEnd,$rptPeriod,$rptUnit,$rc_id,$rpt,$rptSize,"NULL"); $rep_id = $sth-> {mysql_insertid}; ########################################### # Calculate flanking GC values for repeat # ########################################### my $gc_100 = countGC($chrom,$chromStart,$chromEnd,100,100); my $gc_500 = countGC($chrom,$chromStart,$chromEnd,500,500); my $gc_1000 = countGC($chrom,$chromStart,$chromEnd,1000,1000); ###################### # Insert to gc table # ###################### if (! defined $sth2) { $sth2 = $dbh->prepare ("INSERT INTO gc VALUES(?,?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $sth2->execute ($rep_id,$gc_100,$gc_500,$gc_1000); ############# # Get Slice # ############# my $n = 0; my $gene_found = 'n'; my $adder; my $class_determined = 'n'; # print "n is $n genefound is $gene_found\n\n"; while (($gene_found eq 'n') && ($n < 5)) { my $date = `date`; print "while loop start at $date\n\n"; $adder = $n*15000; my $slice = $slice_adaptor->fetch_by_chr_start_end($chrom,$chromStart - $adder,$chromEnd + $adder); my $rptRegion = $slice->seq; # Initialize location variable my $location; ############################ # Get all genes from slice # ############################ my $genes = $slice->get_all_Genes; foreach my $gene(@$genes) { my $ens_id; print "this is n $n\n\n"; my $gene_type = $gene->type; if ($gene_type eq 'ensembl') { $gene_found = 'y'; my $ens_name = $gene->stable_id; # print $ens_name . "\n\n"; print "n is $n\n\n"; ################################## # Is the gene already in ens_db? # ################################## my $date = `date`; print "local gene test start at $date\n\n"; my $gene_exists = 'n'; if (! defined $select_ensname) { $select_ensname = $dbh->prepare ("SELECT e.ens_id, e.ens_name FROM ens_db e WHERE e.ens_name = ?") or die "Couldn't prepare statement: " . $dbh->errstr; } $select_ensname->execute ($ens_name); while ( my $href = $select_ensname->fetchrow_hashref ) { my $temp_ens = $href->{ens_name}; $ens_id = $href->{ens_id}; print "this is the ens_id number: $ens_id\n\n"; if ($ens_name eq $temp_ens) { # print "$ens_name already exists in the database\n\n"; $gene_exists = 'y'; } } my $date = `date`; print "local gene test end at $date\n\n"; ################################### # Get all Database links for gene # ################################### ######################################################################### # If the gene doesn't exist in schz_db then collect suppplementary info # ######################################################################### # print "gene_exists is $gene_exists\n\n"; if ($gene_exists eq 'n') { my $date = `date`; print "get all gene info start at $date\n\n"; my $gene_chr = $gene->chr_name; my $gene_start = $gene->start; my $gene_end = $gene->end; my $gene_strand = $gene->strand; my $slice_start = $slice->chr_start; my $slice_end = $slice->chr_end; my $gene_des = $gene->description; $gene_des =~ s/'//g; print "slice_start $slice_start slice_end $slice_end gene_start $gene_start gene_end $gene_end gene_des $gene_des\n\n"; unless($gene_des) { $gene_des = 'NULL'; } my $true_gene_start = ($slice_start + $gene_start - 1); my $true_gene_end = ($slice_end + $gene_end); my $true_strand; if ($gene_strand == 1) { $true_strand = "+"; } elsif ($gene_strand == -1 ) { $true_strand = "-"; } print "ens_name $ens_name, gene_des: $gene_des, gene_chr: $gene_chr, true_gene_start: $true_gene_start, true_gene_end: $true_gene_end, true_strand: $true_strand\n\n"; if (! defined $insert_ensdb) { $insert_ensdb = $dbh->prepare ("INSERT INTO ens_db VALUES(?,?,?,?,?,?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $insert_ensdb->execute ("NULL",$ens_name,"NULL",$gene_des,$gene_chr,$true_gene_start,$true_gene_end,$true_strand); $ens_id = $insert_ensdb-> {mysql_insertid}; if ($gene->is_known) { ################################################################################# # Initialize @affy array to collect AffyMetrix tag information from GeneNote DB # # within NeuroLog. This seperate array is needed because we cannot have two # # simultaneous connections to NeuroLog # ################################################################################# my @affy; my $x = 0; my $i = 0; #################################################################################################### # Ensembl sometimes has duplicate records for a gene, use the $affy_test variable to test for this # #################################################################################################### my $affy_test; foreach my $link (@{$gene->get_all_DBLinks}) { my $database = $link->database; my $display_id = $link->display_id; if ($database eq 'GO') { $display_id =~ s/GO\://; if (! defined $insert_go) { $insert_go = $dbh->prepare ("INSERT INTO go VALUES(?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $insert_go->execute ("NULL",$ens_id,$display_id); } elsif ($database eq 'PDB') { if (! defined $insert_pdb) { $insert_pdb = $dbh->prepare ("INSERT INTO pdb VALUES(?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $insert_pdb->execute ("NULL",$ens_id,$display_id); } elsif ($database eq 'MIM') { if (! defined $insert_mim) { $insert_mim = $dbh->prepare ("INSERT INTO mim VALUES(?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } $insert_mim->execute ("NULL",$ens_id,$display_id); } elsif ($database =~ /(AFFY_HG_U95[ABCDE]$)/ ) { unless ($affy_test =~ /$display_id/) { $affy_test .= $display_id; print "insert into affy $1, this is the affy test var $affy_test\n\n"; ###################################################################################### # Check for the Affy Probe Set in the genenote table, and if it exists, use its g_id # ###################################################################################### if (! defined $select_genenote) { $select_genenote = $dbh->prepare ("SELECT g.g_id FROM genenote g WHERE g.id_ref = ?") or die "Couldn't prepare statement: " . $dbh->errstr; } my $g_id; $select_genenote->execute ($display_id); while ( my $href = $select_genenote->fetchrow_hashref ) { $g_id = $href->{g_id}; unless ($g_id) { print "ERROR: $display_id for $ens_name does not have an entry in the genenote database\n"; } else { print "g_id for $display_id is $g_id\n"; @affy[$x] = $g_id; } $x++; } } # print "this is the affy array: @affy\n\nLast x value is $x\n\n"; } elsif ($database eq 'HUGO') { if (! defined $update_hugo) { $update_hugo = $dbh->prepare ("UPDATE ens_db SET name = ? WHERE ens_name = ?") or die "Couldn't prepare statement: " . $dbh->errstr; } $update_hugo->execute ($display_id,$ens_name); } } my $i = 0; if (! defined $insert_affy) { $insert_affy = $dbh->prepare ("INSERT INTO affy VALUES(?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } while ($i < $x) { print "this is the value being inserted $affy[$i]\n\n"; $insert_affy->execute ("NULL",$ens_id,$affy[$i]); $i++; } } } my $date = `date`; print "get all gene info end at $date\n\n"; #print "n is $n\n\n"; if ($n == 0) { ####################################################################### # We now know that this is the first pass of the loop for this repeat # # We don't know which transcript classifies the repeat # # There may be many ts associated with repeat record # ####################################################################### my $strander = $gene->strand; my $transcripts = $gene->get_all_Transcripts; foreach my $transcript(@$transcripts) { my $date = `date`; print "transcript info start at $date\n\n"; my $trans_id = $transcript->stable_id; # print "transcriptid is $trans_id\n\n"; my $coding_start = $transcript->coding_start; my $coding_end = $transcript->coding_end; # print "coding start is $coding_start and coding end is $coding_end \n\n"; my %exon_structure; my $exons = $transcript->get_all_Exons ; foreach my $exon(@$exons) { my $exon_start = $exon->start; my $exon_end = $exon->end; my $exon_end = $exon->end; my $exon_seq = $exon->seq->seq; # print "$trans_id $exon_start $exon_end:\n\n$exon_seq\n\n"; ######################################## # Populate hash with exon co-ordinates # ######################################## $exon_structure{$exon_start} = $exon_end; } foreach my $k (sort { $a <=> $b } (keys %exon_structure)) { } my %five_utr; my %cds; my %three_utr; foreach my $k (sort { $a <=> $b } (keys %exon_structure)) { # print "k is $k next one $exon_structure{$k} \n"; if (($k < $coding_start) && ($exon_structure{$k} < $coding_start)) { # print "these are 5UTR $k $exon_structure{$k}\n"; $five_utr{$k} = $exon_structure{$k}; } elsif (($k < $coding_start) && ($exon_structure{$k} > $coding_start)) { # print "these are 5UTR $k $coding_start, followed by this is CDS $coding_start $exon_structure{$k}\n"; $five_utr{$k} = $coding_start; $cds{$coding_start} = $exon_structure{$k}; } elsif (($k >= $coding_start) && ($exon_structure{$k} <= $coding_end)) { # print "these are CDS $k $exon_structure{$k}\n"; $cds{$k} = $exon_structure{$k}; } elsif (($k < $coding_end) && ($exon_structure{$k} > $coding_end)) { # print "these are 3UTR $coding_end $exon_structure{$k}, followed by this is CDS $k $coding_end\n"; $cds{$k} = $coding_end; $three_utr{$coding_end} = $exon_structure{$k}; } elsif (($k > $coding_end) && ($exon_structure{$k} > $coding_end)) { # print "these are 3UTR $k $exon_structure{$k}\n"; $three_utr{$k} = $exon_structure{$k}; } } if ($strander == -1) { my %temp = %three_utr; %three_utr = %five_utr; %five_utr = %temp; } my $min_5utr; my $max_5utr; my $x=0; foreach my $k (sort { $a <=> $b } (keys %five_utr)) { if ($x == 0) { #print "5utr $k $five_utr{$k}\n\n"; $min_5utr = $k; $max_5utr = $five_utr{$k}; } elsif ($x > 0) { if ($k < $min_5utr) { $min_5utr = $k; } elsif ($five_utr{$k} > $max_5utr) { $max_5utr = $five_utr{$k}; } } $x++; } my $x=0; my $min_cds; my $max_cds; foreach my $k (sort { $a <=> $b } (keys %cds)) { print "cds $k $cds{$k}\n\n"; if ($x == 0) { $min_cds = $k; $max_cds = $cds{$k}; } elsif ($x > 0) { if ($k < $min_cds) { $min_cds = $k; } elsif ($cds{$k} > $max_cds) { $max_cds = $cds{$k}; } } $x++; } my $min_3utr; my $max_3utr; my $x=0; foreach my $k (sort { $a <=> $b } (keys %three_utr)) { #print "3utr $k $three_utr{$k}\n\n"; if ($x == 0) { $min_3utr = $k; $max_3utr = $three_utr{$k}; } elsif ($x > 0) { if ($k < $min_3utr) { $min_3utr = $k; } elsif ($three_utr{$k} > $max_3utr) { $max_3utr = $three_utr{$k}; } } $x++; } print "gene co-ords\n"; print "strand is $strander\n"; print "5utr: min $min_5utr - max $max_5utr\n"; print "cds start $coding_start cds end $coding_end\n"; print "3utr: min $min_3utr - max $max_3utr\n\n"; my $found = 'n'; ############################################ # Where in the gene is the repeat located? # ############################################ ############################################ # Is the repeat within the 5UTR boundaries # ############################################ if (($repStart >= $min_5utr) && ($repEnd <= $max_5utr)) { foreach my $k (sort { $a <=> $b } (keys %five_utr)) { if (($repStart >= $k) && ($repEnd <= $five_utr{$k})) { $location = "5utr"; $found = 'y'; } elsif ((($repStart < $k) && ($repEnd >= $k)) || (($repStart <= $five_utr{$k}) && ($repEnd > $five_utr{$k}))) { $location = "5utr-intron boundary"; $found = 'y'; } } if ($found eq 'n') { $location = "5utr-intron"; $found = 'y'; } } ########################################### # Is the repeat within the CDS boundaries # ########################################### elsif (($repStart >= $coding_start) && ($repEnd <= $coding_end)) { foreach my $k (sort { $a <=> $b } (keys %cds)) { if (($repStart >= $k) && ($repEnd <= $cds{$k})) { $location = "exon"; $found = 'y'; } elsif ((($repStart < $k) && ($repEnd > $k) && ($repEnd < $cds{$k})) || (($repStart > $k) && ($repStart < $cds{$k}) && ($repEnd > $cds{$k}))) { $location = "intron-exon boundary"; $found = 'y'; } if ($found eq 'n') { $location = "intron"; $found = 'y'; } } } ############################################ # Is the repeat within the 3UTR boundaries # ############################################ elsif (($repStart >= $min_3utr) && ($repEnd <= $max_3utr)) { foreach my $k (sort { $a <=> $b } (keys %three_utr)) { if (($repStart >= $k) && ($repEnd <= $three_utr{$k})) { $location = "3utr"; $found = 'y'; } elsif ((($repStart < $k) && ($repEnd > $k)) || (($repStart < $three_utr{$k}) && ($repEnd > $three_utr{$k}))) { $location = "3utr-intron-boundary"; $found = 'y'; } } if ($found eq 'n') { $location = "3utr-intron"; $found = 'y'; } } if (($strander == 1) && ($found eq 'n')) { ######################################### # Is the repeat on the 5UTR boundaries? # ######################################### # if gene has 5' and 3' UTRS if (($min_5utr) && ($max_5utr) && ($min_3utr) && ($max_3utr)) { print "gene has 5 and 3 utrs\n\n"; if (($repStart < $min_5utr) && ($repEnd > $min_5utr) && ($repEnd < $max_5utr)) { $location = "5utr-ncs"; $found = 'y'; } elsif (($repStart > $min_5utr) && ($repEnd < $coding_end)) { $location = "5utr-cds"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repEnd < $max_3utr)) { $location = "3utr-cds"; $found = 'y'; } elsif (($repStart > $min_3utr) && ($repStart < $max_3utr) && ($repEnd > $max_3utr)) { $location = "3utr-ncs"; $found = 'y'; } } # if gene has 5' UTR elsif (($min_5utr) && ($max_5utr)) { print "gene has 5 utr\n\n"; if (($repStart < $min_5utr) && ($repEnd > $min_5utr) && ($repEnd < $max_5utr)) { $location = "5utr-ncs"; $found = 'y'; } elsif (($repStart > $min_5utr) && ($repEnd < $coding_end)) { $location = "5utr-cds"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repStart < $coding_end) && ($repEnd > $coding_end)) { $location = "cds-ncs"; $found = 'y'; } } # if gene has 3' UTR elsif (($min_3utr) && ($max_3utr)) { print "gene has 3 utr\n\n"; if (($repStart < $coding_start) && ($repEnd > $coding_start) && ($repEnd < $coding_end)) { $location = "cds-ncs"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repEnd < $max_3utr)) { $location = "3utr-cds"; $found = 'y'; } elsif (($repStart > $min_3utr) && ($repStart < $max_3utr) && ($repEnd > $max_3utr)) { $location = "3utr-ncs"; $found = 'y'; } } } elsif (($strander == -1) && ($found eq 'n')) { # if gene has 5' and 3' UTRS if (($min_5utr) && ($max_5utr) && ($min_3utr) && ($max_3utr)) { print "-ve strand gene has 5 and 3 utrs\n\n"; if (($repStart < $min_3utr) && ($repEnd > $min_3utr) && ($repEnd < $max_3utr)) { $location = "3utr-ncs"; $found = 'y'; } elsif (($repStart > $min_3utr) && ($repEnd < $coding_end)) { $location = "3utr-cds"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repEnd < $max_5utr)) { $location = "5utr-cds"; $found = 'y'; } elsif (($repStart > $min_5utr) && ($repStart < $max_5utr) && ($repEnd > $max_5utr)) { $location = "5utr-ncs"; $found = 'y'; } } # if gene has 5' UTR elsif (($min_5utr) && ($max_5utr)) { print "-ve strand gene has 5 utr\n\n"; if (($repStart < $coding_start) && ($repEnd > $coding_start) && ($repEnd < $coding_end)) { $location = "cds-ncs"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repEnd < $max_5utr)) { $location = "5utr-cds"; $found = 'y'; } elsif (($repStart > $min_5utr) && ($repStart < $max_5utr) && ($repEnd > $max_5utr)) { $location = "5utr-ncs"; $found = 'y'; } } # if gene has 3' UTR elsif (($min_3utr) && ($max_3utr)) { print "-ve strand gene has 3 utr\n\n"; if (($repStart < $min_3utr) && ($repEnd > $min_3utr) && ($repEnd < $max_3utr)) { $location = "3utr-ncs"; $found = 'y'; } elsif (($repStart > $min_3utr) && ($repEnd < $coding_end)) { $location = "3utr-cds"; $found = 'y'; } elsif (($repStart > $coding_start) && ($repStart < $coding_end) && ($repEnd > $coding_end)) { $location = "cds-ncs"; $found = 'y'; } } } # if gene has no UTR if ($found eq 'n') { print "-ve strand gene has no UTRs\n\n"; if ((($repStart < $coding_start) && ($repEnd > $coding_start) && ($repEnd < $coding_end)) || (($repStart > $coding_start) && ($repStart < $coding_end) && ($repEnd > $coding_end))) { $location = "cds-ncs"; $found = 'y'; } } if (($found eq 'y') && ($location)) { # print "location of repeat found: $found, location: $location\n\n"; print "into transcripts table: $rep_id, $trans_id, $location, $ens_name\n\n"; print "############################################\n"; print "############################################\n"; print "gene co-ords\n"; print "strand is $strander\n"; print "5utr: min $min_5utr - max $max_5utr\n"; print "cds start $coding_start cds end $coding_end\n"; print "3utr: min $min_3utr - max $max_3utr\n\n"; print "############################################\n"; foreach my $k (sort { $a <=> $b } (keys %five_utr)) { print "5utr $k $five_utr{$k}\n\n"; print "############################################\n"; print "############################################\n"; } #################################################################### # Determine Repeat instability as detected within UniGene Clusters # #################################################################### # test variable to see if forthcoming loop executed my $ug_stats = 'n'; print "this is ug_stats before loop $ug_stats\n\n"; # initialize variables to collect summary stats my $stat = Statistics::Descriptive::Full->new(); if (($location eq "exon") || ($location eq "5utr") || ($location eq "3utr")) { # initialize variables to collect summary stats my $i = 0; my $ug_cluster; my $ug_sequence; my $slice_adaptor = $db->get_SliceAdaptor; # Create slices my $padder = 10; # pad each repeat by this much flanking seq for BLAT query my $slice= $slice_adaptor->fetch_by_chr_start_end($chrom,$chromStart-$padder,$chromEnd+$padder); my $l_slice = $slice_adaptor->fetch_by_chr_start_end($chrom,$chromStart-$padder,$chromStart-1); my $r_slice = $slice_adaptor->fetch_by_chr_start_end($chrom,$chromEnd+1,$chromEnd+$padder); # Get flanking sequence my $l_seq = $l_slice->seq; my $r_seq = $r_slice->seq; my $seq = $slice->seq; my $rep = $slice->get_repeatmasked_seq->seq; print "seq is:\n$seq\n\n"; print "left seq is:\n$l_seq\n\n"; print "right seq is:\n$r_seq\n\n"; my $blat_seq = "$rep_id\.fa"; unless ( open(SEQ, ">$blat_seq") ) { die "Cannot open file \"$blat_seq\" to write to!\n\n"; } print SEQ ">$rep_id\|$chrom\:$chromStart\-$chromEnd\|$rptUnit\|$rptLength\n"; print SEQ "$seq"; close(SEQ); # Run BLAT query print "BLAT query\n\n"; print `/home/perseusm/blat/gfClient $hostname $blat_port / $rep_id.fa $rep_id`; my $blat_hits = "$rep_id"; print "open this file: $blat_hits\n\n"; unless ( open(HITS, "$blat_hits") ) { die "Cannot open file \"$blat_hits\" to write to!\n\n"; } while () { if (/^(\d+)\s+(\d+)\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/) { print " 1: $1 2: $2 3: $3 4: $4 5: $5 6: $6 7: $7 8: $8 9: $9 10: $10 11: $11 \n\n"; my $match = $1; my $mismatch = $2; my $qGapCount = $3; my $qGapBases = $4; my $tGapCount = $5; my $tGapBases = $6; my $query = $7; my $qSize = $8; my $qStart = $9; my $qEnd = $10; $ug_cluster = $11; $ug_sequence = $ug_cluster; $ug_cluster =~ s/^(Hs\.\S+)\|.*/\1/; $ug_sequence =~ s/^.*\|(Hs\#.*)/\1/; print "this is the cluster: $ug_cluster\n"; print "this is the sequence: $ug_sequence\n"; my $blatScore = $match - $mismatch - $qGapCount - $tGapCount; my $identity = ($match)/($match + $mismatch + $qGapCount); $identity = $identity * 100; $identity = sprintf "%.1f",$identity; print "identity: $identity score: $blatScore\n\n"; if (!($clusters->{$ug_cluster})) { ##################################### # Open up clusters to count repeats # ##################################### my $ugc_file_name = "/home/perseusm/unigene/ugc_fasta/$ug_cluster\.ugc"; open(REP, $ugc_file_name) || die "Cannot open $ugc_file_name"; my $seq; my $ugc = 0; my $dna; while() { if ($ugc == 0) { if (/^\>.*\|Hs\#(\S+)\ .*\/ug\=Hs\.(\d+)/) { $seq = $1; $ugc = $2; } elsif (/(^\S+)/) { $dna = $dna . $1; } } elsif ($ugc > 0) { if (/^\>.*\|Hs\#(\S+)\ .*\/ug\=Hs\.(\d+)/) { # print "in loop 2 seq is $1\t cluster is $2\n"; $clusters->{$ug_cluster}{$seq} = $dna; $seq = $1; $ugc = $2; $dna =~ s/.*//g; } elsif (/(^\S+)/) { $dna = $dna . $1; } } } # add the last cluster $clusters->{$ug_cluster}{$seq} = $dna; $ugc = "Hs." . $ugc; if ($ug_cluster ne $ugc) { print "ERROR: file name cluster id does not match cluster id within file!\n"; } } # return all the values in the cluster my $hash = $clusters->{$ug_cluster}; foreach my $ug_seq (keys %$hash) { print "$ug_seq\t" . "\n"; my $x = 0; my $hit = $hash->{$ug_seq}; my $repeat; my $count = 0; my $left_extra; my $right_extra; # count to make sure only one hit exists in the cluster while($hit =~ /($l_seq(\S*?)(($rptUnit)+)(\S*)$r_seq)/ig) { $x++; } # test to see how many hits in the cluster if ($x > 1) { print "MULTI_HIT ERROR $x hits found in sequence\n"; } elsif ($x == 1) { # if there is only only one hit, it's a clean cluster # proceed to counting the number of repeats in the hit while($hit =~ /($l_seq(\S*?)(($rptUnit)+)(\S*)$r_seq)/ig) { $left_extra = $2; $repeat = $3; $right_extra = $5; print "1 is $1 2 is $2 3 is $3 4 is $4 5 is $5\n"; print "left extra is $left_extra\n"; print "this is the repeat $repeat\n"; print "right extra is $right_extra\n\n"; } print "repeat is: $rpt, period length $rptPeriod\n"; print "length of left " . length($left_extra) . "\n"; print "length of right " . length($right_extra) . "\n"; if ((length($left_extra) < $rptPeriod) && (length($right_extra) < $rptPeriod)) { while($repeat =~ /($rptUnit)/ig) { $count++; } } } if ($count > 0) { print "HIT $ug_seq has $count $rptUnit\'s in it\n"; $select_unigene->execute ($ug_cluster); while ( my $href = $select_unigene->fetchrow_hashref ) { my $found_hit = 'n'; # test variable to see if hits match co-ords of unigene clusters my $cluster_chr = $href->{chr}; my $cluster_start = $href->{start}; my $cluster_end = $href->{end}; # pad the start and end variables to close matched to be count; my $padder = 10_000; if (length ($cluster_start > 10000)) { $cluster_start = $cluster_start - $padder; } $cluster_end = $cluster_end + $padder; my $test_chr = $chrom . "a"; # concatenate these values to a string to force string comparison $cluster_chr = $cluster_chr . "a"; if (($test_chr eq $cluster_chr) && ($chromStart > $cluster_start) && ($chromEnd < $cluster_end)) { $found_hit = 'y'; print "$rep_id hits $ug_cluster for real!\n"; print "##################################\n\n"; print "rep_id is $rep_id\n"; print "chr is $test_chr\n"; print "start is $chromStart\n"; print "end is $chromEnd\n"; print "cluster is $ug_cluster\n"; print "this is the cluster co-ords: $cluster_chr\:$cluster_start\-$cluster_end\n\n"; print "found_hit is $found_hit\n\n"; if (! defined $insert_ugcount) { $insert_ugcount = $dbh->prepare ("INSERT INTO ugcount VALUES(?,?,?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } if ($found_hit eq 'y') { $ug_stats = 'y'; $insert_ugcount->execute ("NULL",$rep_id,$ug_cluster,$ug_seq,$count); $stat->add_data($count); } } } } } $i++; } } close(HITS); # remove temporary files used for BLAT system("rm $rep_id\* -f"); if ($i == 0) { print "NO HITS for $ug_cluster\:$ug_sequence\n"; } elsif ($i == 1) { print "i: $i one hit!\n"; } elsif ($i > 1) { print "i: $i more than one hit!\n"; } } print "this is rep_id $rep_id and this is ug_stats $ug_stats\n\n"; if ($ug_stats eq 'y') { print "collecting UniGene hits descriptive stats\n"; if (! defined $insert_ugstats) { $insert_ugstats = $dbh->prepare ("INSERT INTO count VALUES(?,?,?,?,?,?,?)") or die "Couldn't prepare statement: " . $dbh->errstr; } my $count = $stat->count(); my $min = $stat->min(); my $max = $stat->max(); my $mean = $stat->mean(); # can't calculate sd if count only 1 my $sd; if ($count > 1) { $sd = $stat->standard_deviation(); } print "min is $min\n"; print "mean is $mean\n"; print "sd is $sd\n"; print "max is $max\n"; print "count is $count\n\n"; $insert_ugstats->execute ("NULL",$rep_id,$count,$min,$max,$mean,$sd); } ####################################### # Determine what the repeat codes for # ####################################### my $pepseq; if ($location eq "exon") { my $pep_length = length($transcript->translate()->seq()); print "this is the length: $pep_length\n\n"; my $trans_strand = $transcript->get_all_Exons->[0]->strand(); my $peptide = $transcript->translate(); my @coords = $transcript->genomic2pep(1,$rptLength,$trans_strand); $pepseq = ''; foreach my $coord (@coords) { if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { print "ERROR: GAP, should not exist in coding region\n"; print $transcript->stable_id . "\n"; print $coord->start(), '-', $coord->end(), "\n"; } else { # in cases where the repeat ends at a stop codon, the code breaks because the end is greater than the total length of the peptide sequence # the following conditional processing code checks for this my $start = $coord->start(); my $end = $coord->end(); if ($end > $pep_length) { $pepseq .= $peptide->subseq($start,$pep_length); } elsif ($end < $pep_length) { $pepseq .= $peptide->subseq($start,$end); } } } } print "this is pepseq out of loop: $pepseq\n\n"; print "prior to insertion: rep_id: $rep_id, trans_id: $trans_id, location: $location, pepseq: $pepseq, ens_id: $ens_id\n"; $insert_transcripts = $dbh->prepare ("INSERT INTO transcripts VALUES(NULL,'$rep_id','$trans_id','$location','$pepseq','$ens_id')"); $insert_transcripts->execute (); $insert_transcripts = $dbh->disconnect(); } elsif ($found eq 'n') { print "$rep_id repeat not within gene boundaries of $trans_id\n\n"; } my $date = `date`; print "transcript info end at $date\n\n"; } # print "end of loop 1"; } } print "this is the gene found : $gene_found, and the n: $n\n\n"; if (($gene_found eq 'y') && ($n > 0)) { $insert_transcripts = $dbh->prepare ("INSERT INTO transcripts VALUES(NULL,'$rep_id',$trans_id,'$adder',NULL,'$ens_id')"); $insert_transcripts->execute (); $insert_transcripts = $dbh->disconnect(); } # print "end of loop 2"; } $n++; } my $date = `date`; print "while loop end at $date\n\n"; } } } ################################ # Disconnect from the database # ################################ print "disconnecting from the database\n\n"; $dbh->disconnect or warn "Disconnection failed: $DBI::errstr\n"; ######################## # Shutdown BLAT server # ######################## print "shutting down BLAT server\n"; print `/home/perseusm/blat/gfServer stop $hostname $blat_port`; ############### # Subroutines # ############### sub countGC { my($chrom,$chromStart,$chromEnd,$left,$right) = @_; print "in sub chrom is $chrom, chromStart is $chromStart and chromEnd is $chromEnd\n\n"; my $slice_left= $slice_adaptor->fetch_by_chr_start_end($chrom,$chromStart-$left,$chromStart-1); my $slice_right= $slice_adaptor->fetch_by_chr_start_end($chrom,$chromEnd+1,$chromEnd+$right); my $left_seq= $slice_left->seq; my $right_seq = $slice_right->seq; my $gc_seq = $left_seq . $right_seq; my $g=0; my $c=0; my $gc_length= length($gc_seq); while($gc_seq =~ /g/ig){$g++} while($gc_seq =~ /c/ig){$c++} my $gc = ($g+$c)/$gc_length; return($gc); }