Source code for origli.utilities.burn_in_utilities

#!/usr/bin/env python

import time
import h5py
from gwpy.timeseries import TimeSeries, TimeSeriesDict
from gwpy.timeseries import TimeSeries
import pandas as pd
from math import floor
import concurrent.futures
import itertools as itrtls
import os
import sys
import numpy as np
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import origli.utilities.const as Const # for multi frequency bands defined in
from origli.utilities.utilities import select_some_trials
from origli.utilities.utilities import IdentifyGlitch, FindRadomlistPoints


#------------------------------
# for selecting the null samples
#--------------------------

[docs]def FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf): ''' description: get the null samples where their durations are drawn from the target set. The only subset of the null samples are chosen if their on-source windows do not coincide with any other glitches. 1. gltich file (.csv file) 2. the target samples 3. create random time stamps with their durations drawn from the target set 4. accept the random time stamps where their on-source windows do not coincide with any other glitches 5. return the info of the accepted glitches USAGE: Listsegments = FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff, TriggerPeakFreqUpperCutoff, targetUpperSNR_thre) :param df: GravitySpy meta data in pandas format :param Epochstart: starting time of an epoch :param Epochend: end time of an epoch :param Commissioning_lt: commissioning time in list :param TargetGlitchClass: a target glitch class name (str) :param IFO: a type of interferometer (H1, L1, V1) (str) :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int :param Confidence_thre: a threshold of confidence level (float or int ) :param UpperDurationThresh: an upper bound of duration in sec (float or int) :param LowerDurationThresh: a lower bound of duration in sec (float or int) :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default :param gap: a time gap between the target and the background segments in sec, 1 sec in default :param TriggerPeakFreqUpperCutoff: the lower SNR threshold for selecting the target set :param targetUpperSNR_thre: the upper SNR threshold for selecting the target set :return: the list of parameters of glitches passing the above thresholds Listsegments contains of ListIndexSatisfied: a list of index of glitches Listtarget_timeseries_start: a list of a target glitch starting time Listtarget_timeseries_end: a list of a target glitch ending time Listpre_background_start: a list of preceding background starting time Listpre_background_end; a list of preceding background ending time Listfol_background_start: a list of following background starting time Listfol_background_end: a list of following background ending time Listgpstime: a list of GPS times Listduration: a list of durations ListSNR: a list of SNRs Listconfi: a list of confidence levels ListID: a list of IDs ''' df_list = [] # take the data in an Epoch for Epoch in Epoch_lt: StartT, EndT = Epoch[0], Epoch[1] df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)] df_list.append(df_epoch) df_int = pd.concat(df_list) # data set containing only a target glitch class try: # gravity spy and omicron trigger mode df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] except KeyError: # pyCBC mode as there is no column "peakFreq" df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] #for cWB #df_target = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] # generate synthetic data points df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre]) #df_random = FindShiftedPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre]) df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points #### improvement for duration 2019-07-02 ############# #### full durations are too long for capturing the glitch signature so that the solution is to choose T90 ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100 ### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default FractionEnergy = 1 # (100% of energy ) # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90 df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration']) df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3) # replace full durations (T100) with T90 # make the time segments of glitches measured # I reserve duration +/- gap around a GPS time of a glitch if UserDefinedDuration > 0: # accept a user defined duration # assign this user defined duration for any types of glieches observed in a given IFO df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration # df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == 'null'), ['duration']] = UserDefinedDuration else: # otherwise, keep the duration that are determined by an event trigger generator (ETG) pass df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap']) df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap']) # data set of randomly chosen background df_glitch = df_int[df_int.label == 'null'] # synthetic background trigger has SNR of zero # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background df_int = df_int[df_int['snr'] > BGSNR_thre] print('all the glitches in the class: {}'.format(df_glitch.shape[0])) # the list of values satisfied as secure ListIndexSatisfied = [] Listtarget_timeseries_start = [] Listtarget_timeseries_end = [] Listpre_background_start = [] Listpre_background_end = [] Listfol_background_start = [] Listfol_background_end = [] Listgpstime = [] Listduration = [] ListSNR = [] Listconfi = [] ListID = [] for ind in range(len(df_glitch)): # target glitch's peak GPS time and the duration gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time duration = df_glitch['duration'].iloc[ind] # # a duration SNR = df_glitch['snr'].iloc[ind] # snr confi = df_glitch['confidence'].iloc[ind] # confidence level ID = df_glitch['id'].iloc[ind] # ID gap_individual = df_glitch['gap'].iloc[ind] if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec pass # go to the next line else: # if the duration of a glitch is less than 0.01 sec continue # go to the next iteration # the starting and ending time of the target glitch # at this moment I assume that the target glitch is distributed evenly around the GPS time target_timeseries_start = gpstime - duration / 2. target_timeseries_end = target_timeseries_start + duration # take the background # the background is around the target glitch # preceding background, which is a nearest neighbor segment before target_timeseries_start # following background, which is a nearest neighbor segment after target_timeseries_end background_duration = 64 # duration # duration of both the proceeding and following background pre_background_start = target_timeseries_start - gap_individual - background_duration pre_background_end = pre_background_start + background_duration # target_timeseries_start - gap_individual fol_background_start = target_timeseries_end + gap_individual fol_background_end = fol_background_start + background_duration # target_timeseries_end + gap_individual + background_duration # security check of the preceding background # case 1 is where the background is coincident with the right edge of the reserved segment # case 2 is where the background is coincident with the left edge of the reserved segment # case 3 is where the reserved segment is inside of the background # case 4 is where the background is inside of the reserved segment # it violates the security of the background estimation if at least one of the cases occurs # i.e., pro_case1, 2, 3, and 4 should be zero case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 1 case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)]) # case 2 case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 3 case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)]) # case 4 if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0: print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration)) ListIndexSatisfied.append(ind) # append the index satisfied as secure Listtarget_timeseries_start.append(target_timeseries_start) Listtarget_timeseries_end.append(target_timeseries_end) Listpre_background_start.append(pre_background_start) Listpre_background_end.append(pre_background_end) Listfol_background_start.append(fol_background_start) Listfol_background_end.append(fol_background_end) Listgpstime.append(gpstime) Listduration.append(duration) ListSNR.append(SNR) Listconfi.append(confi) ListID.append(ID) else: continue # zipped list of segments Listsegments = list( zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start, Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration, ListSNR, Listconfi, ListID)) return Listsegments
#----------------------------------------------------------------------------------------------------------- # For "burn-in" phase # The following functions are for making the background upper threshold for the target and null samples #----------------------------------------------------------------------------------------------------------- #===== select ====
[docs]def FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir): ''' description: create random time stamps with their durations are uniformly distributed in log10 of 0.02 sec to duration_max 1. within an epoch, create a list of synthetic points with randomly chosen with durations 2. make pandas frame dataset USAGE: df = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir) :param state: IFO state {observing, nominal-lock} :param IFO: an observer {H1, L1} :param Epoch_lt: a list of epochs :param number_samples: number of samples picked up :param step: step of data points in sec :param duration_max: a maximum value of the duration in sec :param outputMother_dir: an output directory in witch the data set is in :return: df: synthetic random data points within an epoch ''' # -----------------------------------------------------------------------------------------------------+ # get a list of start and end GPS times of an epoch with a given state {'observing', 'nominal-lock'} | # use IdentifyGlitch.FindObservingTimeSegments() | # -----------------------------------------------------------------------------------------------------+ # get a glitch identifier object indentify = IdentifyGlitch() # get start time and end time of an epoch startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1] # get observing segments SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state) if state == 'nominal-lcok': SegmentsMat[:,0] = SegmentsMat[:, 0] + 300 # remove 5 min after the observing mode begins SegmentsMat[:,1] = SegmentsMat[:, 1] - 60 # remove 1 min before the observing mode ends elif state == 'observing': pass total_seg_used = (SegmentsMat[:,1] - SegmentsMat[:,0]).sum() if number_samples > int(total_seg_used/float(step)): # the number of samples greater than the maximum number of possible samples number_samples = int(total_seg_used/float(step)) # set number of samples equal to the maximum number of possible samples print('User selected number of synthetic samples is greater than the maximum number of possible samples') print('Set the number of samples equal to the maximum number = {}'.format(number_samples)) #-----------------------------------------------------+ # generate synthetic list of center times of samples | #-----------------------------------------------------+ whole_list_center_times = np.array([]) for start_seg_used, end_seg_used in SegmentsMat: length_seg_used = end_seg_used - start_seg_used # length of a segment in sec number_picked = int(number_samples * length_seg_used / float(total_seg_used)) # number of points to be selected in this segment # chop this segment by a given time step, then select "number_picked" points out of "length_seg_used/step" points, without duplication selected_index_center_times = np.random.choice(int(length_seg_used/float(step)), number_picked+1, replace=False) # 1 comes to pick samples a little more than the request at this moment as the sample size might be smaller than than the setting # calculate a list of center times list_center_times = np.array([start_seg_used + step * selected_index_center_time for selected_index_center_time in selected_index_center_times]) # sort this list in ascendingly list_center_times = np.sort(list_center_times) # append this list of center times to the whole list whole_list_center_times = np.append(whole_list_center_times, list_center_times) #whole_list_center_times = whole_list_center_times[:number_samples] # make the sample size equal to the request # randomly choose the timestamps where the size of the sample equal to the request whole_list_center_times = np.random.choice(whole_list_center_times, number_samples, replace=False) # ascending order whole_list_center_times = np.sort(whole_list_center_times) #--------------------------------------+ # generate synthetic list of durations | #--------------------------------------+ duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.01 s gen_d = 10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), number_samples) # make a pandas dataframe that is compatible with a standard format in my program df = pd.DataFrame() df = df.assign(GPStime=whole_list_center_times) df = df.assign(startGPStime=np.zeros(number_samples)) df = df.assign(peakFreq=np.zeros(number_samples)) df = df.assign(snr=np.zeros(number_samples)) df = df.assign(amplitude=np.zeros(number_samples)) df = df.assign(centralFreq=np.zeros(number_samples)) df = df.assign(duration=gen_d) df = df.assign(bandwidth=np.zeros(number_samples)) df = df.assign(chisq=np.zeros(number_samples)) df = df.assign(chisqDof=np.zeros(number_samples)) df = df.assign(confidence=np.zeros(number_samples)) df = df.assign(id=[None] * number_samples) df = df.assign(ifo=['{}'.format(IFO)] * number_samples) df = df.assign(label=['null'] * number_samples) df = df.assign(imgUrl=[None] * number_samples) output_path = os.path.join(trigger_dir, 'RandomSample.csv') df.to_csv(output_path) # save the data sample df = pd.read_csv(output_path) # load the file df.loc[:, 'label'] = 'null' # this is py3 issue, the label of 'null' is read as NaN return df
[docs]def FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap): ''' description: From the random time stamps created with FindRadomlistPointsForBurnIn(), select the subset in which their on-source windows do not overlap with any other glitches Note that if on-source window can be extended, it will do 1. load glitch data set (.csv file) 4. accept time stamps where their on-source windows do not coincide with any other glitches 5. return the info of the accepted glitches USAGE: Listsegments = FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap) :param df: GravitySpy meta data in pandas format :param Epochstart: starting time of an epoch :param Epochend: end time of an epoch :param Commissioning_lt: commissioning time in list :param TargetGlitchClass: a target glitch class name (str) :param IFO: a type of interferometer (H1, L1, V1) (str) :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int :param Confidence_thre: a threshold of confidence level (float or int ) :param UpperDurationThresh: an upper bound of duration in sec (float or int) :param LowerDurationThresh: a lower bound of duration in sec (float or int) :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default :param gap: a time gap between the target and the background segments in sec, 1 sec in default :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches :return: the list of parameters of glitches passing the above thresholds Listsegments contains of ListIndexSatisfied: a list of index of glitches Listtarget_timeseries_start: a list of a target glitch starting time Listtarget_timeseries_end: a list of a target glitch ending time Listpre_background_start: a list of preceding background starting time Listpre_background_end; a list of preceding background ending time Listfol_background_start: a list of following background starting time Listfol_background_end: a list of following background ending time Listgpstime: a list of GPS times Listduration: a list of durations ListSNR: a list of SNRs Listconfi: a list of confidence levels ListID: a list of IDs ''' df_list = [] # take the data in an Epoch for Epoch in Epoch_lt: StartT, EndT = Epoch[0], Epoch[1] df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)] df_list.append(df_epoch) df_int = pd.concat(df_list) # # data set containing only a target glitch class # try: # gravity spy and omicron trigger mode # df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] # except KeyError: # pyCBC mode as there is no column "peakFreq" # df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] # generate synthetic data points df_random = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_trials, step, duration_max, outputMother_dir) df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points #### improvement for duration 2019-07-02 ############# #### full durations are too long for capturing the glitch signature so that the solution is to choose T90 ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100 ### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default FractionEnergy = 1 # (100% of energy ) # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90 df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration']) df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3) # replace full durations (T100) with T90 # make the time segments of glitches measured # I reserve duration +/- gap around a GPS time of a glitch if UserDefinedDuration > 0: # accept a user defined duration # assign this user defined duration for any types of glieches observed in a given IFO df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration else: # otherwise, keep the duration that are determined by an event trigger generator (ETG) pass df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap']) df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap']) # data set of rondomly chosen background df_glitch = df_int[df_int.label == 'null'] # synthetic background trigger has SNR of zero # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background df_int = df_int[df_int['snr'] > BGSNR_thre] print('all the glitches in the class: {}'.format(df_glitch.shape[0])) # the list of values satisfied as secure ListIndexSatisfied = [] Listtarget_timeseries_start = [] Listtarget_timeseries_end = [] Listpre_background_start = [] Listpre_background_end = [] Listfol_background_start = [] Listfol_background_end = [] Listgpstime = [] Listduration = [] ListSNR = [] Listconfi = [] ListID = [] for ind in range(len(df_glitch)): # target glitch's peak GPS time and the duration gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time duration = df_glitch['duration'].iloc[ind] # # a duration SNR = df_glitch['snr'].iloc[ind] # snr confi = df_glitch['confidence'].iloc[ind] # confidence level ID = df_glitch['id'].iloc[ind] # ID gap_individual = df_glitch['gap'].iloc[ind] if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec pass # go to the next line else: # if the duration of a glitch is less than 0.02 sec continue # go to the next iteration # the starting and ending time of the target glitch # at this moment I assume that the target glitch is distributed evenly around the GPS time target_timeseries_start = gpstime - duration / 2. target_timeseries_end = target_timeseries_start + duration # take the background # the background is around the target glitch # preceding background, which is a nearest neighbor segment before target_timeseries_start # following background, which is a nearest neighbor segment after target_timeseries_end background_duration = 64 # duration # duration of both the proceeding and following background pre_background_start = target_timeseries_start - gap_individual - background_duration pre_background_end = pre_background_start + background_duration # target_timeseries_start - gap_individual fol_background_start = target_timeseries_end + gap_individual fol_background_end = fol_background_start + background_duration # target_timeseries_end + gap_individual + background_duration # security check of the preceding background # case 1 is where the background is coincident with the right edge of the reserved segment # case 2 is where the background is coincident with the left edge of the reserved segment # case 3 is where the reserved segment is inside of the background # case 4 is where the background is inside of the reserved segment # it violates the security of the background estimation if at least one of the cases occurs # i.e., pro_case1, 2, 3, and 4 should be zero case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 1 case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)]) # case 2 case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 3 case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)]) # case 4 if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0: try: # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target neareset_gap_pre = (target_timeseries_start - df_int[df_int['reserved_time_end'] < target_timeseries_start]['reserved_time_end']).sort_values()[:1].values[0] if neareset_gap_pre > background_duration - 5: # # if the smallest gap is larger than the background duration neareset_gap_pre = background_duration - 5 # replace it else: pass target_timeseries_start = target_timeseries_start - neareset_gap_pre # replace except IndexError: # there is no nearest glitch that satisfies the above condition pass try: # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target neareset_gap_fol = (df_int[df_int['reserved_time_start'] > target_timeseries_end]['reserved_time_start'] - target_timeseries_end).sort_values()[:1].values[0] if neareset_gap_fol > background_duration - 5: # # if the smallest gap is larger than the background duration neareset_gap_fol = background_duration -5 # replace it else: pass target_timeseries_end = target_timeseries_end + neareset_gap_fol # replace except IndexError: # there is no nearest glitch that satisfies the above condition pass # update duration duration = round(target_timeseries_end - target_timeseries_start, 3) # #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration)) print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration)) ListIndexSatisfied.append(ind) # append the index satisfied as secure Listtarget_timeseries_start.append(target_timeseries_start) Listtarget_timeseries_end.append(target_timeseries_end) Listpre_background_start.append(pre_background_start) Listpre_background_end.append(pre_background_end) Listfol_background_start.append(fol_background_start) Listfol_background_end.append(fol_background_end) Listgpstime.append(gpstime) Listduration.append(duration) ListSNR.append(SNR) Listconfi.append(confi) ListID.append(ID) else: continue # zipped list of segments Listsegments = list( zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start, Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration, ListSNR, Listconfi, ListID)) return Listsegments
# ==== save ====
[docs]def clean_duration_for_asd(duration): ''' description: This function is used for clean the dicimal points in value of duration to avoid the error shown in ASD estimator in gwpy :param duration: duration in sec :return: duration: cleaned duration in sec ''' fourth_decimal_num_of_duration = (duration * 1000 - floor(duration * 1000)) * 10 # fourth decimal number duration = duration - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number # revise the duration to make an error in whitening process not not to occur # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different third_decimal_num_of_duration = round((round(duration, 3) - round(duration, 2)) * 1000) if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point duration = duration - 0.001 # make the duration with an even number at the third decimal point elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point pass # keep the original duration as it is duration = round(duration, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point return duration
[docs]def Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration): ''' description: This is used for normalizing the spectrum of each trial in the on-source window of random time stamps created with FindBGlistBurnIn() iterate each trial for each dummy on 1. from the time series of a single channel 2. calculate how many trials are available per dummy on-source window within the extended on-source window 3. iterate trials per dummy on-source window USAGE: list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration) :param full_timeseries: time series comprising target and BGs :param target_timeseries_start: a start time of a target segment :param target_timeseries_end: an end time of a target segment :param array_dummy_duration: numpy array of dummy on-source windows :return: list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window. These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, .... ]) Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....] sample_rate: sampling rate of this channel DURATION: a duration of a target segment list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window. The longer the dummy on-source, the fewer the trials are available ''' sample_rate = full_timeseries.sample_rate.value DURATION = target_timeseries_end - target_timeseries_start fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number # revise the duration to make an error in whitening process not not to occur # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000) if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point pass # keep the original duration as it is DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point # get the on-source time series on_source = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end) on_source_start = on_source.span[0] on_source_end = on_source.span[1] full_timeseries_start = full_timeseries.span[0] full_timeseries_end = full_timeseries.span[1] # average background ASD length_for_ASD = 128 # in sec timeseries_for_asd = full_timeseries.crop(end=full_timeseries_start+length_for_ASD) list_whitened_fft = [] list_num_trial_used = [] for dummy_duration in array_dummy_duration: # iterate dummy on-source window dummy_duration = clean_duration_for_asd(dummy_duration) # number of BG trials iterate = int(DURATION / dummy_duration) ## --------cropping from the starting time------ # take some fraction of trials list_index_trials = select_some_trials(iterate, maximum_iterate=10) crop_start_list = [on_source_start + i * dummy_duration for i in list_index_trials] # a list of starting times cropped segments crop_end_list = [crop_start + dummy_duration for crop_start in crop_start_list] # a list of ending times of cropped segments # avoid the cropping warning crop_end_list = [crop_end if crop_end < on_source_end else on_source_end for crop_end in crop_end_list] # if end time is replaced with the end of the BG segments if it is larger # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it ave_background_asd = timeseries_for_asd.asd(dummy_duration, 0.5 * dummy_duration, method='median').value if (timeseries_for_asd.value == 0).all() is True: # if the time series is all zero whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)] elif ave_background_asd[ave_background_asd != 0].shape[0] == 0: # if the time series is all zero whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)] else: # replace value of elements equal to zero with non-zero nearest neighbor ave_background_asd_new = ave_background_asd.copy() zero_indecies = np.where(ave_background_asd == 0)[0] nonzero_indecies = np.where(ave_background_asd != 0)[0] if zero_indecies.shape[0] == 0: # if everything is non-zero pass # replace value of elements equal to zero with non-zero nearest neighbor else: # if some of them are zero for zero_ind in zero_indecies: # # nearest index with non-zero value k = np.argsort(np.abs((zero_ind - nonzero_indecies)))[0] ave_background_asd_new[zero_ind] = ave_background_asd[nonzero_indecies[k]] # calculate each whitened FFT whitened_fft_ind_list = [on_source.crop(start=crop_start, end=crop_end).asd(dummy_duration, 0.5 * dummy_duration, method='median').value / ave_background_asd_new for crop_start, crop_end in zip(crop_start_list, crop_end_list)] del crop_start_list, crop_end_list # delete redundant lists whitened_fft = np.concatenate(whitened_fft_ind_list, axis=None) del whitened_fft_ind_list # delete redundant a list # fill list_whitened_fft.append(whitened_fft) # number of trials used num_trial_used = len(list_index_trials) # fill list_num_trial_used.append(num_trial_used) return list_whitened_fft, sample_rate, DURATION, list_num_trial_used
[docs]def SaveDummyTargetHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False', trial_duration_sample=20): ''' description: Save the normalized spectrum for every channels for every glitches, with the use of Multiprocess_whitening_for_burn_in() 1. iterate all the samples 2. whiten the on-source window for every channels 4. save the whitened spectrum as a HDF5 file each group in the HDF5 is for a single channel and each of datasets per group has the normalized spectrum for a dummy on-source window USAGE: SaveTargetAndBackGroundHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False') :param Listsegments: a list of segment parameters :param re_sfchs: a list of safe channels :param outputpath: a directory of an output file :param outputfilename: a name of an output file :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default :param trial_duration_sample: a number of dummy on-source :return None ''' # check if the output directory exists if os.path.isdir(outputpath) is False: print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath)) print("Please specify the existing directory for it.") sys.exit() # the list of the safe channels channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs] # the path and the file name outputPathFileName = os.path.join(outputpath, outputfilename) # make an output file try: f = h5py.File(outputPathFileName, 'w-') f.close() except IOError: print('The file already exists in {}'.format(outputPathFileName)) print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it') sys.exit() # remove the astro cashe if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True: os.remove('~/.astro/cashe/downlowd/py3/lock') # open a HDF5 file in writing mode with h5py.File(outputPathFileName, 'r+') as f: for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments: t0 = time.time() try: # deal with time series in bad state if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) else: pass channels_used = channels if PlusHOFT == 'True': # get the data of hoft if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) else: pass full_timeseriesDict.append(full_timeseriesDict_hoft) # append del full_timeseriesDict_hoft # delete the hoft object channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO)) # ===================================================== except IndexError: print('Bad data, go to a next GPS') continue # go to a next GPS time except RuntimeError: print('RuntimeError, go to a next GPS') continue except IOError: # error when getting time series print('IOError, go to a next GPS') continue except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ] print('ValueError, go to a next GPS') continue g = f.create_group('gps{:05d}'.format(IndexSatisfied)) g.attrs['GPS'] = gpstime # a peak GPS time g.attrs['duration'] = duration g.attrs['snr'] = SNR # snr g.attrs['confidence'] = confi # confidence g.attrs['id'] = ID g.attrs['ifo'] = channels[0].split(':')[0] # total on-source window on_source_duration = target_timeseries_end - target_timeseries_start # duration on trial on-source window #------------------------------------------------------------+ # randomly choose dummy on-source windows uniform in log10 | #------------------------------------------------------------+ duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s duration_max = 15 if duration_max > on_source_duration: duration_max = on_source_duration array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3) try: ch_labels = range(len(channels_used)) # a list of channel labels list_array_dummy_duration = [] List_list_whitened_fft = [] # a list of whitened fft of target List_sample_rate = [] # a list of sample rates List_DURATION = [] list_list_num_trial_used = [] # whitened target and BG segments with multi processes with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor: for list_whitened_fft, sample_rate, DURATION, list_num_trial_used in executor.map(Multiprocess_whitening_for_burn_in, full_timeseriesDict.values(), itrtls.repeat(target_timeseries_start), itrtls.repeat(target_timeseries_end), itrtls.repeat(array_dummy_duration) ): List_list_whitened_fft.append(list_whitened_fft) # fill List_sample_rate.append(sample_rate) # fill List_DURATION.append(DURATION) # fill list_list_num_trial_used.append(list_num_trial_used) # fill for ch_label in ch_labels: # iterate through channel labels sub_g = g.create_group("ch{:05d}".format(ch_label)) sub_g.attrs['SamplingRate'] = List_sample_rate[ch_label] # target sub_g.attrs['channel'] = channels_used[ch_label].split(':')[1] # target for dummy_duration_index in range(len(array_dummy_duration)): dset = sub_g.create_dataset("dummy{:05d}".format(dummy_duration_index), data=List_list_whitened_fft[ch_label][dummy_duration_index], compression="gzip", compression_opts=9) # target dset.attrs['SamplingRate'] = List_sample_rate[ch_label] # target dset.attrs['duration'] = array_dummy_duration[dummy_duration_index] # target dset.attrs['num_trial'] = list_list_num_trial_used[ch_label][dummy_duration_index] # target except ValueError: del f['gps{:05d}'.format(IndexSatisfied)] print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied)) continue # go to a next GPS time except IOError: # IOError: Driver write request failed del f['gps{:05d}'.format(IndexSatisfied)] print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied)) continue # go to a next GPS time print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied)) t1 = time.time() print('Time taken: {} s'.format(t1 - t0))
# ====== save for multi-frequency bands search =====
[docs]def BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'): ''' description: For a single channel per glitch, this function calculates a list of the background upper threshold across dummy on-source windows USAGE: list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None') :param list_dummy_duration: a list of dummy on-source window :param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window :param sampling_rate: sampling rate of a channel :param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window :param sigma: an integer to determine the upper bound of the off-source window :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: Count: importance ''' list_bg_upper_threshold = np.array([]) for dummy_duration_index, dummy_duration in enumerate(list_dummy_duration, 0): sp = list_whitened_fft[dummy_duration_index] num_trial_used = list_num_trial_used[dummy_duration_index] try: # test if it is an array sp[0] except ValueError: # if it is None sp = 0 # assign an integer #======================= if type(sp) != int: sp = sp.reshape(int(num_trial_used), int(len(sp)/num_trial_used)) # reshape it else: sp = np.array([[0]]) # a trick to be used in the line, if np.isnan(sp[0]) != True : #======================= #++++++ for the case where there is no data measured at a channel +++++++++++++++++++ #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++ #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if np.isnan(sp[:,0]).any() != True : # if sp has data measured # low, high, or band pass filter if we know which frequency band is important if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: sp = np.array([[0]]) else: LowerCutOffElement = int(LowerCutOffFreq * dummy_duration) if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector sp = sp[:, LowerCutOffElement:] else: # if LowerCutOffElement is greater than the length of the vector sp = np.array([[0]]) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter # even if UpperCutOffElement is greater than the length of the vector is works fine UpperCutOffElement = int(UpperCutOffFreq * dummy_duration) sp = sp[:, :UpperCutOffElement] elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: sp = np.array([[0]]) else: LowerCutOffElement = int(LowerCutOffFreq * dummy_duration) UpperCutOffElement = int(UpperCutOffFreq * dummy_duration) if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector sp = sp[:, LowerCutOffElement:UpperCutOffElement] else: # if LowCutOffElement is greater than the length of the vector sp = np.array([[0]]) else: pass if sp[0,0] == 0: GausCeiling = 0 else: GausCeiling = sp.mean() + sigma * sp.std(ddof=1) # gaussian noise ceiling else: GausCeiling = 0 # manually put zero because this channel has no contribution at this glitch ID list_bg_upper_threshold = np.append(list_bg_upper_threshold, GausCeiling) return list_bg_upper_threshold
[docs]def BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma): ''' description: calculate values of the background upper threshold per dummy on-source window and per frequency band USAGE: MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma) :param list_dummy_duration: a list of dummy on-source window :param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window :param sampling_rate: sampling rate of a channel :param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window :param sigma: an integer to determine the upper bound of the off-source window :return: MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right ''' # iterate different frequency bands in the list MatBGUpperThresh = np.array([]) for lower_freq_cut, upper_freq_cut in Const.freq_bands: list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq=lower_freq_cut, UpperCutOffFreq=upper_freq_cut) list_bg_upper_threshold = list_bg_upper_threshold.reshape(1, list_bg_upper_threshold.shape[-1]) MatBGUpperThresh = np.vstack([MatBGUpperThresh, list_bg_upper_threshold]) if MatBGUpperThresh.size else list_bg_upper_threshold return MatBGUpperThresh
[docs]def CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma): ''' discription: 1. make list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window. These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, .... ]) Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....] sample_rate: sampling rate of this channel DURATION: a duration of a target segment list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window. The longer the dummy on-source, the fewer the trials are available 2. calculate values of the background upper threshold per dummy on-source window and per frequency band USAGE: MatBGUpperThresh, sample_rate = CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma) :param full_timeseries: the full time series in gwpy object including on- and off source windows :param target_timeseries_start: the start time of the on-source window :param target_timeseries_end: the end time of the on-source window :param array_dummy_duration: numpy array of dummy on-source windows :param sigma: an interger to calculate the value of importance :return: MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right sample_rate: a sampling rate of a single channel ''' # calculate the whitened FFT of the on- and off-source windows list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration) # calculate the importance MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(array_dummy_duration, list_whitened_fft, sample_rate, list_num_trial_used, sigma) return MatBGUpperThresh, sample_rate
[docs]def CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample): ''' description: 1. use a single glitch time 2. query timeseries of all the channels around a glitch 3. calculate values of the background upper threshold per dummy on-source window per frequency band 4. iterate through channels USAGE: IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max = 15, trial_duration_sample = 20) :param Listsegment: a list of segment parameters :param IFO :param re_sfchs: a list of safe channels :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'} :param sigma: an integer to be used for calculating values of importance :param maximum value of dummy on-sourc window in sec :param a number of dummy on-source windows within the total on-source window :return: IndexSatisfied: glitch index list_mat_BG_upper_thresh: a list of matrices per channel where element of a matrix is a value of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right array_dummy_duration: numpy array of the dummy on-source windows list_sample_rates: a list of sampling rates of channels, numpy array re_sfchs: list of channels without "IFO:" at the beginning gpstime: a gps time duration: a value of duration SNR: signal to noise ratio in the h(t) conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None ID: a glitch ID, provided by Gravity Spy in usual ''' channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs] IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment t0 = time.time() # extracting the target data and the proceeding and following backgrounds try: # deal with time series in bad state if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) else: pass if PlusHOFT == 'True': # get the data of hoft if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) else: pass full_timeseriesDict.append(full_timeseriesDict_hoft) # append del full_timeseriesDict_hoft # delete the hoft object re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN') except IndexError as err: print("IndexError: {}".format(err.message)) # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099) return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 except RuntimeError as err: print("RuntimeError: {}".format(err.message)) return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 except IOError as err: # error when getting time series print('IOError: {}'.format(err.message)) return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ] print('ValueError') return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 try: # total on-source window on_source_duration = target_timeseries_end - target_timeseries_start # duration on trial on-source window # -------------------------------------------+ # randomly choose dummy on-source windows | # -------------------------------------------+ duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s if duration_max > on_source_duration: duration_max = on_source_duration array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3) list_mat_BG_upper_thresh = [] # a list of rho list_sample_rates = np.array([]) # whitened target and BG segments with multi processes with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor: for MatBGUpperThresh, sample_rate in executor.map(CreateBGUpperThresh_single_channel_multiband, full_timeseriesDict.values(), itrtls.repeat(target_timeseries_start), itrtls.repeat(target_timeseries_end), itrtls.repeat(array_dummy_duration), itrtls.repeat(sigma) ): # fill list_mat_BG_upper_thresh.append(MatBGUpperThresh) list_sample_rates = np.append(list_sample_rates, sample_rate) except ValueError: print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 except IOError: # IOError: Driver write request failed print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 t1 = time.time() print('Time taken: {} s'.format(t1 - t0)) return IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID # pass the parameters
[docs]def SaveUppperThreshodBG_multiband_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max=15, trial_duration_sample=20): ''' description: THIS IS USED FOR "OFFLINE" MODE 1. iterate glitch samples 2. get values of the background upper threshold per dummy on-source window per frequency band for each of channels 4. save the values Note this depends on USAGE: SaveUppperThreshodBG_multiband_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max = 15, trial_duration_sample = 20) :param Listsegments: a list of segment parameters :param re_sfchs: a list of safe channels :param outputpath: a directory of an output file :param outputfilename: a name of an output file :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default :param sigma: an integer to determine the upper bound of the off-source window :param duration_max: a maximum value of length of dummy on-source window in sec :param trial_duration_sample: a number of dummy on-source window within the total on-source window :return None ''' # the path and the file name outputPathFileName = os.path.join(outputpath, outputfilename) # make an output file try: f = h5py.File(outputPathFileName, 'w-') f.close() except IOError: print('The file already exists in {}'.format(outputPathFileName)) print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it') sys.exit() # remove the astro cashe if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True: os.remove('~/.astro/cashe/downlowd/py3/lock') # open a HDF5 file in writing mode with h5py.File(outputPathFileName, 'r+') as f: for Listsegment in Listsegments: IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample) if type(duration) == int: # if some error occurs, List_rho is 0, go to the next iteration continue else: g = f.create_group('gps{:05d}'.format(IndexSatisfied)) g.attrs['GPS'] = gpstime # a peak GPS time g.attrs['duration'] = duration g.attrs['snr'] = SNR # snr g.attrs['confidence'] = confi # confidence g.attrs['id'] = ID g.attrs['ifo'] = IFO g.create_dataset("dummy_durations", data=array_dummy_duration, compression="gzip", compression_opts=9) # matrix of frequency bnads freq_bands = np.array(Const.freq_bands).tolist() for i in range(len(freq_bands)): for j in range(len(freq_bands[0])): freq_bands[i][j] = freq_bands[i][j].encode('utf8') g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads for ch_label in range(len(re_sfchs_used)): # iterate through channel labels dset = g.create_dataset("ch{:05d}".format(ch_label), data=list_mat_BG_upper_thresh[ch_label], compression="gzip", compression_opts=9) dset.attrs['SamplingRate'] = list_sample_rates[ch_label] # target dset.attrs['channel'] = re_sfchs_used[ch_label].encode('utf8') # target
#-------------------------- # post-process for burn-in #--------------------------
[docs]def make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file): ''' description: make a dictionry of values of the background upper threshold across dummy on-source window and frequency bands per channel USAGE: list_all_dummy_duration_sorted, mat_ch_sorted_dict = make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file) :param input_dir: input directory :param input_hdf5_file: name of a HDF5 file :return: list_all_dummy_duration_sorted: ascending list of dummy on-source windows mat_ch_sorted_dict: a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right ''' # path to a HDF5 file which contains all the values of the background upper threshold across channels and frequency bands path_input_hdf5 = os.path.join(input_dir, input_hdf5_file) if os.path.isfile(path_input_hdf5) is False: print("No input background upper threshold h5py file called {}".format(path_input_hdf5)) print("Exit this program") sys.exit() # open the input hdf5 file with reading mode f = h5py.File(path_input_hdf5, 'r') # list of glitches list_g = list(f.keys()) # --------------------------+ # get the list of channels | # --------------------------+ list_channels = np.array([]) g = f[list_g[0]] for dset_ind in g.keys(): if dset_ind[:2] != "ch": continue channel = g[dset_ind].attrs["channel"].decode('utf8') list_channels = np.append(list_channels, channel) #-------------------------+ # get the frequency bands | #-------------------------+ list_freq_bands = g['freq_bands'][()] #--------------------------------------------------------------------------------------------------------------+ # get the background upper thresholds for all the glitches and all the channels across all the frequency bands | #--------------------------------------------------------------------------------------------------------------+ mat_ch_dict = {} # keys are channels list_all_dummy_duration = np.array([]) # list of all the dummy durations # several dummy durations are chosen per glitch for gps_ind in f.keys(): # iterate through glitches g = f[gps_ind] list_dummy_duration = g['dummy_durations'][()] list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration) for ch_ind in range(len(list_channels)): # iterate through channels # extract the matrix of the back ground upper threshold per channel per glitch # each element in this matrix is the BG upper threshold value for a given frequency band and a given dummy duration # frequency bands are listed in row from 0 to 7, for example # dummy durations are in columns bg_up_mat = g["ch{:05d}".format(ch_ind)][()] if gps_ind == list_g[0]: # if the iteration is the first glitch mat_ch_dict[list_channels[ch_ind]] = bg_up_mat # fill the matrix to the dictionary else: # otherwise # stack to the previous matrix mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat #-----------------------------------------------------------------------+ # short the upper threshold ascendingly in the order of dummy durations | #-----------------------------------------------------------------------+ sorted_index = np.argsort(list_all_dummy_duration) # sort index ascending list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index] # sorted dummy durations mat_ch_sorted_dict = {} for channel in mat_ch_dict.keys(): # iterate through channels mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index] # sort the values and fill print("Finished extracting values of the background upper threshold") return list_all_dummy_duration_sorted, mat_ch_sorted_dict
[docs]def save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10): ''' description: 1. fit the polynomial function against the background upper threshold as a function of dummy on-source window per freq band per channel 2. save the polynomial parametes to a HDF5 file USAGE: save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10) :param list_all_dummy_duration_sorted: ascending list of dummy on-source windows :param mat_ch_sorted_dict: a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right :param output_dir: output directory :param output_hdf5_file: otuput file name :param med_abs_sigma: a interger number of median absolute error to remove outliers for fitting :param poly_degree: polynomial degree :return: None ''' path_output_hdf5 = os.path.join(output_dir, output_hdf5_file) try: f = h5py.File(path_output_hdf5, 'w-') f.close() except IOError: print('The file already exists in {}'.format(path_output_hdf5)) os.system('rm {}'.format(path_output_hdf5)) # open a HDF5 file in writing mode with h5py.File(path_output_hdf5, 'a') as f: for channel in mat_ch_sorted_dict.keys(): # iterate through channels print(channel) g = f.create_group(channel) bg_up_mat = mat_ch_sorted_dict[channel] for band_ind in range(bg_up_mat.shape[0]): # iterate through frequency bandas print('freq band {}'.format(band_ind)) # remove nan samples bg_up_bd_nonnan_ind = np.where(np.isnan(bg_up_mat[band_ind, :]) == False)[0] sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind] sample_y = bg_up_mat[band_ind, :][bg_up_bd_nonnan_ind] # remove extreme large outliers median = np.median(sample_y) med_abs_dev = stats.median_absolute_deviation(sample_y) if med_abs_dev == 0: # if all the values are zero, med_abs_dev will be zero med_abs_dev = 1 non_ext_outliner_ind = \ np.where((median - med_abs_sigma * med_abs_dev < sample_y) & (sample_y < median + med_abs_sigma * med_abs_dev))[0] sample_x = sample_x[non_ext_outliner_ind] sample_y = sample_y[non_ext_outliner_ind] # fit with a polynomial z = np.polyfit(sample_x, sample_y, poly_degree) dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9) dset.attrs['channel'] = channel.encode('utf8') dset.attrs['freqband'] = int(band_ind) print("Finished extrapolate values of the background upper threshold") print("Saved as {}".format(path_output_hdf5)) return None
#--------------- # for target sample #---------------
[docs]def FindglitchlistextendBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf): ''' description: 1. glitch data set (.csv file) 2. get the data about the target glitch class 3. get the subset of the target glitches based on SNR and confidence level threshold a user defines 4. the preceeding and following BGs are 64 sec-long for every samples regardless the overlapping with any other glitches 5. return the info of the accepted glitches USAGE: Listsegments = FindglitchlistextendBG(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf) :param df: GravitySpy meta data in pandas format :param Epochstart: starting time of an epoch :param Epochend: end time of an epoch :param Commissioning_lt: commissioning time in list :param TargetGlitchClass: a target glitch class name (str) :param IFO: a type of interferometer (H1, L1, V1) (str) :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int :param Confidence_thre: a threshold of confidence level (float or int ) :param UpperDurationThresh: an upper bound of duration in sec (float or int) :param LowerDurationThresh: a lower bound of duration in sec (float or int) :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default :param gap: a time gap between the target and the background segments in sec, 1 sec in default :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers given by an ETG for target glitches :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers given by an ETG queries for target glitches :param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers given by an ETG queries for target glitches :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches :return: the list of parameters of glitches passing the above thresholds Listsegments contains of ListIndexSatisfied: a list of index of glitches Listtarget_timeseries_start: a list of a target glitch starting time Listtarget_timeseries_end: a list of a target glitch ending time Listpre_background_start: a list of preceding background starting time Listpre_background_end; a list of preceding background ending time Listfol_background_start: a list of following background starting time Listfol_background_end: a list of following background ending time Listgpstime: a list of GPS times Listduration: a list of durations ListSNR: a list of SNRs Listconfi: a list of confidence levels ListID: a list of IDs ''' df_list = [] # take the data in an Epoch for Epoch in Epoch_lt: StartT, EndT = Epoch[0], Epoch[1] df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)] df_list.append(df_epoch) df_int = pd.concat(df_list) #### improvement for duration 2019-04-05 ############# #### full durations are too long for capturing the glitch signature so that the solution is to choose T90 ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100 FractionEnergy = 1. # (90% of energy ) # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90 # df_int = df_int.assign(gap = gap + (1 - FractionEnergy)/2.*df_int['duration']) df_int = df_int.assign(gap=[gap] * df_int.shape[0]) df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90 # make the target segments # I reserve duration +/- gap around a GPS time of a glitch if UserDefinedDuration > 0: # accept a user defined duration # redefine only the duration of the target glitch df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration else: pass df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap']) df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap']) # reserving_time does not include gap #df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration']) #df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration']) # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background df_int = df_int[df_int['snr'] >= BGSNR_thre] # I think this is ok 2018-12-15 # data set containing only a target glitch class try: # gravity spy and omicron trigger mode df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] except KeyError: # pyCBC mode as there is no column "peakFreq" # add central frequency cuts df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] # for cWB triggers # df_glitch = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)] print('all the glitches in the class: {}'.format(df_glitch.shape[0])) # the list of values satisfied as secure ListIndexSatisfied = [] Listtarget_timeseries_start = [] Listtarget_timeseries_end = [] Listpre_background_start = [] Listpre_background_end = [] Listfol_background_start = [] Listfol_background_end = [] Listgpstime = [] Listduration = [] ListSNR = [] Listconfi = [] ListID = [] for ind in range(len(df_glitch)): # target glitch's peak GPS time and the duration gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time duration = df_glitch['duration'].iloc[ind] # # a duration SNR = df_glitch['snr'].iloc[ind] # snr confi = df_glitch['confidence'].iloc[ind] # confidence level ID = df_glitch['id'].iloc[ind] # ID gap_individual = df_glitch['gap'].iloc[ind] if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec pass # go to the next line else: # if the duration of a glitch is less than 0.02 sec continue # go to the next iteration # the starting and ending time of the target glitch target_timeseries_start = gpstime - duration * position_duration_bfr_centr target_timeseries_end = target_timeseries_start + duration # take the background # the background is around the target glitch # preceding background, which is a nearest neighbor segment before target_timeseries_start # following background, which is a nearest neighbor segment after target_timeseries_end background_duration = 64 # duration # duration of both the proceeding and following background pre_background_start = target_timeseries_start - gap_individual - background_duration pre_background_end = pre_background_start + background_duration #target_timeseries_start - gap_individual fol_background_start = target_timeseries_end + gap_individual fol_background_end = fol_background_start + background_duration #target_timeseries_end + gap_individual + background_duration print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration)) if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round( fol_background_end - fol_background_start, 3) < round(duration, 3): sys.exit() ListIndexSatisfied.append(ind) # append the index satisfied as secure Listtarget_timeseries_start.append(target_timeseries_start) Listtarget_timeseries_end.append(target_timeseries_end) Listpre_background_start.append(pre_background_start) Listpre_background_end.append(pre_background_end) Listfol_background_start.append(fol_background_start) Listfol_background_end.append(fol_background_end) Listgpstime.append(gpstime) Listduration.append(duration) ListSNR.append(SNR) Listconfi.append(confi) ListID.append(ID) # zipped list of segments Listsegments = list( zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start, Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration, ListSNR, Listconfi, ListID)) return Listsegments
[docs]def Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end): ''' description: This is used for multi processing for whitening segments USAGE: whitened_fft_target, sample_rate, DURATION = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end) :param full_timeseries: time series comprising target and BGs :param target_timeseries_start: a start time of a target segment :param target_timeseries_end: an end time of a target segment :return: whitened_fft_target: whitened fft of a target segment sample_rate: sampling rate of this channel DURATION: a duration of a target segment ''' sample_rate = full_timeseries.sample_rate.value DURATION = target_timeseries_end - target_timeseries_start fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number # revise the duration to make an error in whitening process not not to occur # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000) if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point pass # keep the original duration as it is DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point full_timeseries_start = full_timeseries.span[0] full_timeseries_end = full_timeseries.span[1] ave_background_asd = full_timeseries.crop(end=full_timeseries_start+128).asd(DURATION, DURATION * 0.5, method='median').value # whitening Target target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end) if (full_timeseries.value == 0).all() is True: # if the time series is all zero whitened_fft_target = np.zeros(ave_background_asd.shape[0]) elif ave_background_asd[ave_background_asd != 0].shape[0] == 0: # if the time series is all zero whitened_fft_target = np.zeros(ave_background_asd.shape[0]) else: fft_target = target_timeseries.asd(DURATION, DURATION * 0.5, method='median').value # fft of the target # whitening the target using the average BG whitened_fft_target = fft_target / ave_background_asd return whitened_fft_target, sample_rate, DURATION
[docs]def SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'): ''' description: THIS IS USED FOR "OFFLINE" MODE 0. assuming Listsegments is given by Findglitchlist() 1.take the information of the list of allowed target and a preceding and following segments 2. whiten a target segmement based on the average background segment 3. find whitened FFT 4. save the whitened target and backgrounds FFTs Note this depends on USAGE: SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False') :param Listsegments: a list of segment parameters :param re_sfchs: a list of safe channels :param outputpath: a directory of an output file :param outputfilename: a name of an output file :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default ''' # check if the output directory exists if os.path.isdir(outputpath) is False: print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath)) print("Please specify the existing directory for it.") sys.exit() # the list of the safe channels channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs] # the path and the file name outputPathFileName = os.path.join(outputpath, outputfilename) # make an output file try: f = h5py.File(outputPathFileName, 'w-') f.close() except IOError: print('The file already exists in {}'.format(outputPathFileName)) print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it') sys.exit() # remove the astro cashe if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True: os.remove('~/.astro/cashe/downlowd/py3/lock') # open a HDF5 file in writing mode with h5py.File(outputPathFileName, 'r+') as f: for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments: t0 = time.time() try: # deal with time series in bad state if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) else: pass channels_used = channels if PlusHOFT == 'True': # get the data of hoft if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) else: pass full_timeseriesDict.append(full_timeseriesDict_hoft) # append del full_timeseriesDict_hoft # delete the hoft object channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO)) # ===================================================== except IndexError: print('Bad data, go to a next GPS') continue # go to a next GPS time except RuntimeError: print('RuntimeError, go to a next GPS') continue except IOError: # error when getting time series print('IOError, go to a next GPS') continue except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ] print('ValueError, go to a next GPS') continue g = f.create_group('gps{:05d}'.format(IndexSatisfied)) g.attrs['GPS'] = gpstime # a peak GPS time g.attrs['duration'] = duration g.attrs['snr'] = SNR # snr g.attrs['confidence'] = confi # confidence g.attrs['id'] = ID g.attrs['ifo'] = channels[0].split(':')[0] try: ch_labels = range(len(channels_used)) # a list of channel labels List_whitened_fft_target = [] # a list of whitened fft of target List_sample_rate = [] # a list of sample rates List_DURATION = [] # a list of durations # whitened target and BG segments with multi processes with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor: for whitened_fft_target, sample_rate, DURATION in executor.map(Multiprocess_whitening_for_target, full_timeseriesDict.values(), itrtls.repeat(target_timeseries_start), itrtls.repeat(target_timeseries_end) ): List_whitened_fft_target.append(whitened_fft_target) # fill List_sample_rate.append(sample_rate) # fill List_DURATION.append(DURATION) # fill for ch_label in ch_labels: # iterate through channel labels dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=List_whitened_fft_target[ch_label], compression="gzip", compression_opts=9) # target dsetTarget.attrs['SamplingRate'] = List_sample_rate[ch_label] # target dsetTarget.attrs['channel'] = channels_used[ch_label].split(':')[1] # target dsetTarget.attrs['duration'] = List_DURATION[ch_label] # target except ValueError: del f['gps{:05d}'.format(IndexSatisfied)] print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied)) continue # go to a next GPS time except IOError: # IOError: Driver write request failed del f['gps{:05d}'.format(IndexSatisfied)] print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied)) continue # go to a next GPS time print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied)) t1 = time.time() print('Time taken: {} s'.format(t1 - t0))
[docs]def cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq): """ description: calculate the importance for a single channel in a given frequency band USAGE: Count = cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq) :param whitened_fft_target: on-source window normalized spectrum :param sampling_rate: sample rate :param DURATION: duration of on-source window :param poly_para: polynomial parameters of the fit of the background upper threshold as a function of on-source window length :param LowerCutOffFreq: :param UpperCutOffFreq: :return: """ sp = whitened_fft_target bg_up_thresh_fitted = np.poly1d(poly_para) GausCeiling = bg_up_thresh_fitted(DURATION) if np.isnan(sp).any() != True: # if sp has data measured # low, high, or band pass filter if we know which frequency band is important if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: sp = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq * DURATION) if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector sp = sp[LowerCutOffElement:] else: # if LowerCutOffElement is greater than the length of the vector sp = np.array([0]) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter # even if UpperCutOffElement is greater than the length of the vector is works fine UpperCutOffElement = int(UpperCutOffFreq * DURATION) sp = sp[:UpperCutOffElement] elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: sp = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq * DURATION) UpperCutOffElement = int(UpperCutOffFreq * DURATION) if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector sp = sp[LowerCutOffElement:UpperCutOffElement] else: # if LowCutOffElement is greater than the length of the vector sp = np.array([0]) else: pass if GausCeiling == 0: Count = 0 else: Count = sp[sp > GausCeiling].shape[0] / float(sp.shape[0]) else: Count = 0 # manually put zero because this channel has no contribution at this glitch ID return Count
[docs]def HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para): ''' description: calculate the importance for a signle channel for frequency bands USAGE: Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para) :param whitened_fft_target: whitened fft of the on-source window :param duration: a duration of the on-source window :param sampling_rate: sampling rate of a channel :param list_poly_para: a list of polynomial fit of the background upper threshold per freq band :return: Counts_in_multibands: values of importance in different frequency bands, numpy array ''' # iterate different frequency bands in the list Counts_in_multibands = np.array([cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para[freq_band_ind], LowerCutOffFreq=Const.freq_bands[freq_band_ind][0], UpperCutOffFreq=Const.freq_bands[freq_band_ind][1]) for freq_band_ind in range(len(Const.freq_bands))]) return Counts_in_multibands
[docs]def CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para): ''' discription: 1. calculate the normalized spectrum 2. compute the value of importance for a single channel across frequency bands USAGE: Counts_in_multibands, sample_rate = CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para) :param full_timeseries: the full time series in gwpy object including on- and off source windows :param target_timeseries_start: the start time of the on-source window :param target_timeseries_end: the end time of the on-source window :param list_poly_para: a list of polynomial fit of the background upper threshold per freq band :return: Counts_in_multibands: values of importance in different frequency bands where importance is a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel sample_rate: a sampling rate of a single channel ''' # calculate the whitened FFT of the on- and off-source windows whitened_fft_target, sample_rate, duration = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end) # calculate the importance Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sample_rate, duration, list_poly_para) return Counts_in_multibands, sample_rate
[docs]def CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh): ''' description: 1. use a single glitch time 2. query timeseries of all the channels around a glitch 3. calculate the normalized spectrum 4. compute the value of importance for a single channel across frequency bands USAGE: IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh) :param Listsegment: a list of segment parameters :param IFO :param channels: a list of safe channels :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'} :param hdf5_obj_bg_up_thresh: a HDF5 object that contains the polynomial parameters of the fit that represent the background upper theshold as a functin of on-source window length for all the channels and freq bands :return: IndexSatisfied: glitch index Mat_Count_in_multibands: a matrix of rho where frequencies in rows and channels in columns, numpy array list_sample_rates: a list of sampling rates of channels, numpy array re_sfchs: list of channels without "IFO:" at the beginning gpstime: a gps time duration: a value of duration SNR: signal to noise ratio in the h(t) conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None ID: a glitch ID, provided by Gravity Spy in usual ''' # get the polynomical parameters fitted the background upper threshold across channels and the frequency bands poly_para_dict = {} list_g = list(hdf5_obj_bg_up_thresh.keys()) num_freq_band = len(list(hdf5_obj_bg_up_thresh[list_g[0]].keys())) for ch in hdf5_obj_bg_up_thresh.keys(): g = hdf5_obj_bg_up_thresh[ch] poly_para_dict[ch] = [g["BP{:d}".format(freq_band_ind)][()] for freq_band_ind in range(num_freq_band)] channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs] # reorder the keys in the backgroud upper thresh dictionary poly_para_dict = {ch: poly_para_dict[ch] for ch in re_sfchs} IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment t0 = time.time() # extracting the target data and the proceeding and following backgrounds try: # deal with time series in bad state if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) else: pass if PlusHOFT == 'True': # get the data of hoft if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) else: pass full_timeseriesDict.append(full_timeseriesDict_hoft) # append del full_timeseriesDict_hoft # delete the hoft object re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN') except IndexError as err: print("IndexError: {}".format(err.message)) # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099) return 0, 0, 0, 0, 0, 0, 0, 0, 0 except RuntimeError as err: print("RuntimeError: {}".format(err.message)) return 0, 0, 0, 0, 0, 0, 0, 0, 0 except IOError as err: # error when getting time series print('IOError: {}'.format(err.message)) return 0, 0, 0, 0, 0, 0, 0, 0, 0 except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ] print('ValueError') return 0, 0, 0, 0, 0, 0, 0, 0, 0 try: Mat_Count_in_multibands = np.array([]) # a list of rho list_sample_rates = np.array([]) # whitened target and BG segments with multi processes with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor: for Counts_in_multibands, sample_rate in executor.map(CreateRho_single_channel_multiband_from_bg_up_bd_prior, full_timeseriesDict.values(), itrtls.repeat(target_timeseries_start), itrtls.repeat(target_timeseries_end), poly_para_dict.values() ): # fill #Mat_Count_in_multibands = np.vstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands Counts_in_multibands = Counts_in_multibands.reshape(Counts_in_multibands.shape[0], 1) Mat_Count_in_multibands = np.hstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands list_sample_rates = np.append(list_sample_rates, sample_rate) except ValueError: print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0, 0, 0 except IOError: # IOError: Driver write request failed print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0, 0, 0 t1 = time.time() print('Time taken: {} s'.format(t1 - t0)) return IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID # pass the parameters
[docs]def SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False'): ''' description: THIS IS USED FOR "OFFLINE" MODE 1. take the information of the list of allowed target and a preceding and following segments 2. query time series and get normalized spectrum and calculate importance 4. save the whitened target and backgrounds FFTs Note this depends on USAGE: SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False') :param Listsegments: a list of segment parameters :param re_sfchs: a list of safe channels :param outputpath: a directory of an output file :param outputfilename: a name of an output file :param number_process: a number of processes in parallel :param path_hdf5_bg_up_thresh: path to a HDF5 file that contains the polynomial fit of the background upper threshold :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default :return None ''' # load a HDF5 file that contains the polynomical parameters extrapolating the background upper threshold if os.path.isfile(path_hdf5_bg_up_thresh) is False: print("No such file for the BG upper threshold: {}".format(path_hdf5_bg_up_thresh)) f_bg_up_thresh = h5py.File(path_hdf5_bg_up_thresh, 'r') # the path and the file name outputPathFileName = os.path.join(outputpath, outputfilename) # make an output file try: f = h5py.File(outputPathFileName, 'w-') f.close() except IOError: print('The file already exists in {}'.format(outputPathFileName)) print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it') sys.exit() # remove the astro cashe if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True: os.remove('~/.astro/cashe/downlowd/py3/lock') # open a HDF5 file in writing mode with h5py.File(outputPathFileName, 'r+') as f: for Listsegment in Listsegments: IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, f_bg_up_thresh) if type(duration) == int: # if some error occurs, List_rho is 0, go to the next iteration continue else: g = f.create_group('gps{:05d}'.format(IndexSatisfied)) g.attrs['GPS'] = gpstime # a peak GPS time g.attrs['duration'] = duration g.attrs['snr'] = SNR # snr g.attrs['confidence'] = confi # confidence g.attrs['id'] = ID g.attrs['ifo'] = IFO dset_mat_rho = g.create_dataset("mat_rho", data=Mat_Count_in_multibands, compression="gzip", compression_opts=9) # a matrix of rho dset_sample_rates = g.create_dataset("sample_rates", data=list_sample_rates, compression="gzip", compression_opts=9) # list of sample rates of channels dset_channels = g.create_dataset("channels", data=[re_sfch.encode('utf8') for re_sfch in re_sfchs_used], compression="gzip", compression_opts=9) # list of channels freq_bands = np.array(Const.freq_bands).tolist() for i in range(len(freq_bands)): for j in range(len(freq_bands[0])): freq_bands[i][j] = freq_bands[i][j].encode('utf8') dset_freq_bands = g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads #dsetTarget.attrs['duration'] = List_DURATION[ch_label] # target with h5py.File(outputPathFileName, 'r') as f: print(f['gps{:05d}'.format(IndexSatisfied)]['mat_rho'][()] )
# # # outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data" # outputfilename = "MagnetometerL1.h5" # # list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file = make_bg_up_thesh_interpolate_hdf5(outputpath, outputfilename) # save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file, '.', 'bg_up_thre.h5') # # # # #%%%%%%%%%%%%%%%% # # # #------ # # target # #--------- # path = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/target/HDF5data/MagnetometerL1.h5" # f_t = h5py.File(path, 'r') # list_g = list(f_t.keys()) # # --- channels ---- # list_channels = np.array([]) # g = f_t[list_g[0]] # for dset_ind in g.keys(): # if dset_ind[:2] != "ch": # continue # channel = g[dset_ind].attrs["channel"] # list_channels = np.append(list_channels, channel) # # # # ch = 'PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ' # #ch = 'ASC-SRC2_Y_OUT_DQ' # ch_ind_int = np.where(list_channels == ch)[0][0] # list_sp_mean = np.array([]) # list_sp_std = np.array([]) # list_duration = np.array([]) # list_gpstime = np.array([]) # list_sp = [] # for gps in f_t.keys(): # g = f_t[gps] # sp = g['chT{:05d}'.format(ch_ind_int)][()] # list_sp.append(sp) # sp_mean = sp.mean() # sp_std = sp.std(ddof=1) # gpstime = g.attrs['GPS'] # list_gpstime = np.append(list_gpstime, gpstime) # duration = g['chT{:05d}'.format(ch_ind_int)].attrs['duration'] # list_sp_mean = np.append(list_sp_mean, sp_mean) # list_sp_std = np.append(list_sp_std, sp_std) # list_duration = np.append(list_duration, duration) # # # # #----------- # # priori # #------------ # outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data" # outputfilename = "MagnetometerL1.h5" # # # path_outputfile = os.path.join(outputpath, outputfilename) # # # f = h5py.File(path_outputfile, 'r') # list_g = list(f.keys()) # # #------------------- # # get the list of channels # #--------------------- # list_channels = np.array([]) # g = f[list_g[0]] # for dset_ind in g.keys(): # if dset_ind[:2] != "ch": # continue # channel = g[dset_ind].attrs["channel"].decode('utf8') # list_channels = np.append(list_channels, channel) # list_freq_bands = g['freq_bands'][()] # #---------------- # # get dummy durations # # freq bands # #---------------- # # # ch_ind_int = np.where(list_channels == ch)[0][0] # # # -- get the backgournd upper thresshod for all the glitches and all the channels # mat_ch_dict = {} # list_all_dummy_duration = np.array([]) # for gps_ind in f.keys(): # iterate glitches # #if gps_ind != list_g[0]: # # continue # g = f[gps_ind] # list_dummy_duration = g['dummy_durations'][()] # list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration) # for ch_ind in range(len(list_channels)): # iterate channels # #if ch_ind != ch_ind_int: # # continue # #print(ch_ind) # bg_up_mat = g["ch{:05d}".format(ch_ind)][()] # if gps_ind == list_g[0]: # mat_ch_dict[list_channels[ch_ind]] = bg_up_mat # else: # mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat # # # #-- short the upper threshold ascendingly in the order of dummy durations # sorted_index = np.argsort(list_all_dummy_duration) # sort index ascending # list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index] # mat_ch_sorted_dict = {} # for channel in mat_ch_dict.keys(): # mat_ch_dict[channel][:, sorted_index] # mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index] # # # # -- fit # # file_bg_up_th_hdf5 = 'bg_up_thre.h5' # try: # f = h5py.File(file_bg_up_th_hdf5, 'w-') # f.close() # except IOError: # print('The file already exists in {}'.format(file_bg_up_th_hdf5)) # os.system('rm {}'.format(file_bg_up_th_hdf5)) # # open a HDF5 file in writing mode # with h5py.File(file_bg_up_th_hdf5, 'a') as f: # for channel in list_channels: # print(channel) # g = f.create_group(channel) # for band_ind in range(bg_up_mat.shape[0]): # print('freq band {}'.format(band_ind)) # # remove nan samples # bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0] # sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind] # sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind] # # remove extreme large outliers # median = np.median(sample_y) # med_abs_dev = stats.median_absolute_deviation(sample_y) # if med_abs_dev == 0: # if all the values are zero, med_abs_dev will be zero # med_abs_dev = 1 # non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0] # sample_x = sample_x[non_ext_outliner_ind] # sample_y = sample_y[non_ext_outliner_ind] # z = np.polyfit(sample_x, sample_y, 10) # print('0000') # dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9) # print('11111') # dset.attrs['channel'] = channel.encode('utf8') # print('22222') # dset.attrs['freqband'] = int(band_ind) # # # # # plot # # f_up = h5py.File("/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/bg_up_thre.h5", 'r') # for band_ind in range(8)[7:]: # plt.plot(list_all_dummy_duration_sorted, mat_ch_sorted_dict[ch][band_ind, :], label='band {}'.format(band_ind), color='gray', alpha=0.5) # # remove nan samples # bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0] # sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind] # sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind] # # remove extreme large outliers # median = np.median(sample_y) # med_abs_dev = stats.median_absolute_deviation(sample_y) # if med_abs_dev == 0: # med_abs_dev = 1 # non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0] # sample_x = sample_x[non_ext_outliner_ind] # sample_y = sample_y[non_ext_outliner_ind] # z = np.polyfit(sample_x, sample_y, 10) # print(z) # p = np.poly1d(z) # plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='-', alpha=0.5, # label='polynomial fit') # # #### # z = f_up[ch]["BP{:d}".format(band_ind)][()] # print(z) # p = np.poly1d(z) # plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='--', alpha=0.5, label='polynomial fit from the file') # #plt.errorbar(list_duration, list_sp_mean, yerr=list_sp_std, fmt='o', color='red') # plt.legend() # plt.title(ch) # plt.yscale('log') # plt.xscale('log') # #plt.ylim(median - 6 * med_abs_dev, median + 6 * med_abs_dev) # plt.ylabel("upper cut") # plt.xlabel("duration [sec]") # plt.savefig('{}.png'.format(ch)) # plt.close() # # # importance # list_imp = [] # for t_duration_ind in range(len(list_duration)): # sp = list_sp[t_duration_ind] # imp = sp[sp > p(list_duration[t_duration_ind])].shape[0] / float(sp.shape[0]) # list_imp.append(imp) # # plt.scatter(list_duration, list_imp) # plt.title(ch) # plt.xscale('log') # plt.ylabel("rho") # plt.xlabel("duration [sec]") # plt.savefig("imp_{}.png".format(ch)) # plt.close() # # # # # # # # # # gps = 1262368238 # gps1 = 1262369268 # d = 64 # fft_length = 4 # gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.) # duration = 3.45 # on = gwdata.crop(end=gwdata.span[0] + duration) # spectrum = gwdata.asd(fft_length, fft_length /2, method='median') # interpolated_spectrum = spectrum.interpolate(round(1/duration, 5)) # # # on_asd = on.asd(duration, round(duration /2, 3), method='median') # # # print("spectrum df= {} Hz".format(spectrum.df)) # print("interpolated spectrum df= {} Hz".format(interpolated_spectrum.df)) # print("on spectrum df= {} Hz".format(on_asd.df)) # # # gps = 1262368238 # glitchy # gps1 = 1262369268 # quiet # d = 64 # gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.) # gwdata1 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-d/2., gps1+d/2.) # gwdata2 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-1., gps1+1.) # # # list_fft_length = [16, 8, 4] # list_fft_length = [4] # list_method = ['median', 'welch'] # #list_method = ['median'] # for fft_length in list_fft_length: # for method in list_method: # if method == 'welch': # ls = '--' # else: # ls = '-' # spectrum = gwdata.asd(fft_length, fft_length /2, method=method) # spectrum1 = gwdata1.asd(fft_length, fft_length / 2, method=method) # spectrum2 = gwdata2.asd(fft_length, fft_length / 2, method=method) # plt.plot(spectrum.frequencies, spectrum.value, label='fft length = {}, method = {}'.format(fft_length, method), alpha=0.5) # plt.plot(spectrum1.frequencies, spectrum1.value, label='fft length = {}, method = {}, quiet'.format(fft_length, method), alpha=0.5) # plt.plot(spectrum2.frequencies, spectrum2.value, label='fft length = {}, method = {}, quiet, short'.format(fft_length, method), alpha=0.5) # plt.title("duration = {} sec".format(d)) # plt.xlim(40, 4000) # plt.ylabel(r'GW strain ASD [strain$/\sqrt{\mathrm{Hz}}$]') # plt.ylim(1e-24, 3e-21) # plt.xscale('log') # plt.yscale('log') # plt.legend() # plt.savefig("spectrum.png") # plt.close() # # # # import time as TIME # # from gwpy.timeseries import TimeSeries, TimeSeriesDict # # gps = 1175755932.81 # # pre_background_start = gps - 64 # # fol_background_end = gps + 64 # # IFO = "L1" # # number_process = 5 # # channels = ["L1:PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ"] # # t0 = TIME.time() # # full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) # # t1 = TIME.time() # # print("Time taken in {} sec".format(t1 - t0)) # # # # # # import pandas as pd # import os # import sys # import matplotlib # matplotlib.use("Agg") # import matplotlib.pyplot as plt # # # # #------------ # # find the quiet time # #-------------- # # path = '/home/kentaro.mogushi/longlived/MachineLearningJointPisaUM/dataset/GravityspyDataSet/O2/gspy-db-20180813.csv' # df = pd.read_csv(path) # # df = df.drop_duplicates(subset=["GPStime", "duration"]).reset_index() # df = df.assign(reserved_time_start=df['GPStime'] - df['duration'] / 2. - 0.1) # df = df.assign(reserved_time_end=df['GPStime'] + df['duration'] / 2. + 0.1) # # # # # # # # # # # # # # # # for i in range(df.shape[0])[200000:200100]: # # plt.plot([df.iloc[i]["reserved_time_start"] - res_start.min(), df.iloc[i]["reserved_time_end"] - res_start.min()], [i,i]) # # # # plt.ylabel("Glitch index") # # plt.xlabel("on-source window") # # plt.savefig("glitch.png") # # plt.close() # # # c = res_start[1:] - res_end[:-1] # # # # # df_q = pd.DataFrame() # # # plt.hist(c, bins=50) # plt.ylabel("Count") # plt.xlabel("Quiet time length [sec]") # plt.savefig("quiet_time_length.png") # plt.close() # # # #