Source code for origli.utilities.burn_in_utilities

#!/usr/bin/env python

import time
import h5py
from gwpy.timeseries import TimeSeries, TimeSeriesDict
from gwpy.timeseries import TimeSeries
import pandas as pd
from math import floor
import concurrent.futures
import itertools as itrtls
import os
import sys
import numpy as np
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import origli.utilities.const as Const # for multi frequency bands defined in
from origli.utilities.utilities import select_some_trials
from origli.utilities.utilities import IdentifyGlitch, FindRadomlistPoints


#------------------------------
# for selecting the null samples
#--------------------------

[docs]def FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf):
    '''
    description: get the null samples where their durations are drawn from the target set. The only subset of the null samples are chosen if their on-source windows do not coincide with any other glitches.
        1. gltich file (.csv file)
        2. the target samples
        3. create random time stamps with their durations drawn from the target set
        4. accept the random time stamps where their on-source windows do not coincide with any other glitches
        5. return the info of the accepted glitches
    USAGE: Listsegments = FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff, TriggerPeakFreqUpperCutoff, targetUpperSNR_thre)

    :param df: GravitySpy meta data in pandas format
    :param Epochstart: starting time of an epoch
    :param Epochend:  end time of an epoch
    :param Commissioning_lt: commissioning time in list
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
    :param gap: a time gap between the target and the background segments in sec, 1 sec in default
    :param TriggerPeakFreqUpperCutoff: the lower SNR threshold for selecting the target set
    :param targetUpperSNR_thre: the upper SNR threshold for selecting the target set
    :return: the list of parameters of glitches passing the above thresholds
        Listsegments contains of
            ListIndexSatisfied: a list of index of glitches
            Listtarget_timeseries_start: a list of a target glitch starting time
            Listtarget_timeseries_end: a list of a target glitch ending time
            Listpre_background_start: a list of preceding background starting time
            Listpre_background_end; a list of preceding background ending time
            Listfol_background_start: a list of following background starting time
            Listfol_background_end: a list of following background ending time
            Listgpstime: a list of GPS times
            Listduration: a list of durations
            ListSNR: a list of SNRs
            Listconfi: a list of confidence levels
            ListID: a list of IDs
    '''

    df_list = []
    # take the data in an Epoch
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
        df_list.append(df_epoch)
    df_int = pd.concat(df_list)

    # data set containing only a target glitch class
    try:  # gravity spy and omicron trigger mode
        df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    except KeyError:  # pyCBC mode as there is no column "peakFreq"
        df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
        #for cWB
        #df_target = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]

    # generate synthetic data points
    df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step,  outputMother_dir, df_target[df_target.snr > BGSNR_thre])
    #df_random = FindShiftedPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])

    df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points

    #### improvement for duration 2019-07-02 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
    ### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default
    FractionEnergy = 1  # (100% of energy )
    # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
    df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)  # replace full durations (T100) with T90
    # make the time segments of glitches measured
    # I reserve duration +/- gap around a GPS time of a glitch

    if UserDefinedDuration > 0:  # accept a user defined duration
        # assign this user defined duration for any types of glieches observed in a given IFO
         df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration
        # df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == 'null'), ['duration']] = UserDefinedDuration
    else:  # otherwise, keep the duration that are determined by an event trigger generator (ETG)
        pass

    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])

    # data set of randomly chosen background
    df_glitch = df_int[df_int.label == 'null']  # synthetic background trigger has SNR of zero


    # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] > BGSNR_thre]

    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # the list of values satisfied as secure
    ListIndexSatisfied = []
    Listtarget_timeseries_start = []
    Listtarget_timeseries_end = []
    Listpre_background_start = []
    Listpre_background_end = []
    Listfol_background_start = []
    Listfol_background_end = []
    Listgpstime = []
    Listduration = []
    ListSNR = []
    Listconfi = []
    ListID = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if duration > 0.02:  # accept the duration of glitch is greater than 0.02 sec
            pass  # go to the next line
        else:  # if the duration of a glitch is less than 0.01 sec
            continue  # go to the next iteration

        # the starting and ending time of the target glitch
        # at this moment I assume that the target glitch is distributed evenly around the GPS time
        target_timeseries_start = gpstime - duration / 2.
        target_timeseries_end = target_timeseries_start + duration

        # take the background
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end

        background_duration = 64 # duration  # duration of both the proceeding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration  # target_timeseries_start - gap_individual
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration  # target_timeseries_end + gap_individual + background_duration

        # security check of the preceding background
        # case 1 is where the background is coincident with the right edge of the reserved segment
        # case 2 is where the background is coincident with the left edge of the reserved segment
        # case 3 is where the reserved segment is inside of the background
        # case 4 is where the background is inside of the reserved segment
        # it violates the security of the background estimation if at least one of the cases occurs
        # i.e., pro_case1, 2, 3, and 4 should be zero

        case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)])  # case 1
        case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)])  # case 2
        case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)])  # case 3
        case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)])  # case 4

        if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0:
            print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start,
                                                  fol_background_end - fol_background_start, duration))
            ListIndexSatisfied.append(ind)  # append the index satisfied as secure
            Listtarget_timeseries_start.append(target_timeseries_start)
            Listtarget_timeseries_end.append(target_timeseries_end)
            Listpre_background_start.append(pre_background_start)
            Listpre_background_end.append(pre_background_end)
            Listfol_background_start.append(fol_background_start)
            Listfol_background_end.append(fol_background_end)
            Listgpstime.append(gpstime)
            Listduration.append(duration)
            ListSNR.append(SNR)
            Listconfi.append(confi)
            ListID.append(ID)
        else:
            continue

    # zipped list of segments
    Listsegments = list(
        zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
            Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
            ListSNR, Listconfi, ListID))
    return Listsegments


#-----------------------------------------------------------------------------------------------------------
# For "burn-in" phase
# The following functions are for making the background upper threshold for the target and null samples
#-----------------------------------------------------------------------------------------------------------


#===== select ====
[docs]def FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir):
    '''
    description: create random time stamps with their durations are uniformly distributed in log10 of 0.02 sec to duration_max
        1. within an epoch, create  a list of synthetic points with randomly chosen with durations
        2. make pandas frame dataset
    USAGE: df = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir)

    :param state: IFO state {observing, nominal-lock}
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs
    :param number_samples: number of samples picked up
    :param step: step of data points in sec
    :param duration_max: a maximum value of the duration in sec
    :param outputMother_dir: an output directory in witch the data set is in
    :return: df: synthetic random data points within an epoch
    '''
    # -----------------------------------------------------------------------------------------------------+
    # get a list of start and end GPS times of an epoch with a given state {'observing', 'nominal-lock'}   |
    # use IdentifyGlitch.FindObservingTimeSegments()                                                       |
    # -----------------------------------------------------------------------------------------------------+
    # get a glitch identifier object
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get observing segments
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    if state == 'nominal-lcok':
        SegmentsMat[:,0] = SegmentsMat[:, 0] + 300 # remove 5 min after the observing mode begins
        SegmentsMat[:,1] = SegmentsMat[:, 1] - 60 # remove 1 min before the observing mode ends
    elif state == 'observing':
        pass
    total_seg_used = (SegmentsMat[:,1] - SegmentsMat[:,0]).sum()
    if number_samples > int(total_seg_used/float(step)): # the number of samples greater than the maximum number of possible samples
        number_samples = int(total_seg_used/float(step)) # set number of samples equal to the maximum number of possible samples
        print('User selected number of synthetic samples is greater than the maximum number of possible samples')
        print('Set the number of samples equal to the maximum number = {}'.format(number_samples))
    #-----------------------------------------------------+
    # generate synthetic list of center times of samples  |
    #-----------------------------------------------------+
    whole_list_center_times = np.array([])
    for start_seg_used, end_seg_used in SegmentsMat:
        length_seg_used = end_seg_used - start_seg_used # length of a segment in sec
        number_picked = int(number_samples * length_seg_used / float(total_seg_used)) # number of points to be selected in this segment
        # chop this segment by a given time step, then select "number_picked" points out of "length_seg_used/step" points, without duplication
        selected_index_center_times = np.random.choice(int(length_seg_used/float(step)), number_picked+1, replace=False) # 1 comes to pick samples a little more than the request at this moment as the sample size might be smaller than than the setting
        # calculate a list of center times
        list_center_times = np.array([start_seg_used + step * selected_index_center_time for selected_index_center_time in selected_index_center_times])
        # sort this list in ascendingly
        list_center_times = np.sort(list_center_times)
        # append this list of center times to the whole list
        whole_list_center_times = np.append(whole_list_center_times, list_center_times)
    #whole_list_center_times = whole_list_center_times[:number_samples] # make the sample size equal to the request
    # randomly choose the timestamps where the size of the sample equal to the request
    whole_list_center_times = np.random.choice(whole_list_center_times, number_samples, replace=False)
    # ascending order
    whole_list_center_times = np.sort(whole_list_center_times)
    #--------------------------------------+
    # generate synthetic list of durations |
    #--------------------------------------+
    duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.01 s
    gen_d = 10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), number_samples)

    # make a pandas dataframe that is compatible with a standard format in my program
    df = pd.DataFrame()
    df = df.assign(GPStime=whole_list_center_times)
    df = df.assign(startGPStime=np.zeros(number_samples))
    df = df.assign(peakFreq=np.zeros(number_samples))
    df = df.assign(snr=np.zeros(number_samples))
    df = df.assign(amplitude=np.zeros(number_samples))
    df = df.assign(centralFreq=np.zeros(number_samples))
    df = df.assign(duration=gen_d)
    df = df.assign(bandwidth=np.zeros(number_samples))
    df = df.assign(chisq=np.zeros(number_samples))
    df = df.assign(chisqDof=np.zeros(number_samples))
    df = df.assign(confidence=np.zeros(number_samples))
    df = df.assign(id=[None] * number_samples)
    df = df.assign(ifo=['{}'.format(IFO)] * number_samples)
    df = df.assign(label=['null'] * number_samples)
    df = df.assign(imgUrl=[None] * number_samples)
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path) # save the data sample
    df = pd.read_csv(output_path) # load the file
    df.loc[:, 'label'] = 'null' # this is py3 issue, the label of 'null' is read as NaN
    return df


[docs]def FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap):
    '''
    description: From the random time stamps created with FindRadomlistPointsForBurnIn(), select the subset in which their on-source windows do not overlap with any other glitches
    Note that if on-source window can be extended, it will do
        1. load glitch data set (.csv file)
        4. accept time stamps where their on-source windows do not coincide with any other glitches
        5. return the info of the accepted glitches
    USAGE: Listsegments =  FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap)

    :param df: GravitySpy meta data in pandas format
    :param Epochstart: starting time of an epoch
    :param Epochend:  end time of an epoch
    :param Commissioning_lt: commissioning time in list
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
    :param gap: a time gap between the target and the background segments in sec, 1 sec in default
    :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
    :return: the list of parameters of glitches passing the above thresholds
        Listsegments contains of
            ListIndexSatisfied: a list of index of glitches
            Listtarget_timeseries_start: a list of a target glitch starting time
            Listtarget_timeseries_end: a list of a target glitch ending time
            Listpre_background_start: a list of preceding background starting time
            Listpre_background_end; a list of preceding background ending time
            Listfol_background_start: a list of following background starting time
            Listfol_background_end: a list of following background ending time
            Listgpstime: a list of GPS times
            Listduration: a list of durations
            ListSNR: a list of SNRs
            Listconfi: a list of confidence levels
            ListID: a list of IDs
    '''

    df_list = []
    # take the data in an Epoch
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
        df_list.append(df_epoch)
    df_int = pd.concat(df_list)

    # # data set containing only a target glitch class
    # try:  # gravity spy and omicron trigger mode
    #     df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    # except KeyError:  # pyCBC mode as there is no column "peakFreq"
    #     df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]

    # generate synthetic data points
    df_random = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_trials, step, duration_max, outputMother_dir)
    df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points

    #### improvement for duration 2019-07-02 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
    ### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default
    FractionEnergy = 1  # (100% of energy )
    # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
    df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)  # replace full durations (T100) with T90
    # make the time segments of glitches measured
    # I reserve duration +/- gap around a GPS time of a glitch

    if UserDefinedDuration > 0:  # accept a user defined duration
        # assign this user defined duration for any types of glieches observed in a given IFO
        df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration
    else:  # otherwise, keep the duration that are determined by an event trigger generator (ETG)
        pass

    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])

    # data set of rondomly chosen background
    df_glitch = df_int[df_int.label == 'null']  # synthetic background trigger has SNR of zero


    # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] > BGSNR_thre]

    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # the list of values satisfied as secure
    ListIndexSatisfied = []
    Listtarget_timeseries_start = []
    Listtarget_timeseries_end = []
    Listpre_background_start = []
    Listpre_background_end = []
    Listfol_background_start = []
    Listfol_background_end = []
    Listgpstime = []
    Listduration = []
    ListSNR = []
    Listconfi = []
    ListID = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if duration > 0.02:  # accept the duration of glitch is greater than 0.02 sec
            pass  # go to the next line
        else:  # if the duration of a glitch is less than 0.02 sec
            continue  # go to the next iteration

        # the starting and ending time of the target glitch
        # at this moment I assume that the target glitch is distributed evenly around the GPS time
        target_timeseries_start = gpstime - duration / 2.
        target_timeseries_end = target_timeseries_start + duration

        # take the background
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end

        background_duration = 64 # duration  # duration of both the proceeding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration  # target_timeseries_start - gap_individual
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration  # target_timeseries_end + gap_individual + background_duration

        # security check of the preceding background
        # case 1 is where the background is coincident with the right edge of the reserved segment
        # case 2 is where the background is coincident with the left edge of the reserved segment
        # case 3 is where the reserved segment is inside of the background
        # case 4 is where the background is inside of the reserved segment
        # it violates the security of the background estimation if at least one of the cases occurs
        # i.e., pro_case1, 2, 3, and 4 should be zero

        case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)])  # case 1
        case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)])  # case 2
        case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)])  # case 3
        case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)])  # case 4



        if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0:
            try:
                # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                neareset_gap_pre = (target_timeseries_start - df_int[df_int['reserved_time_end'] < target_timeseries_start]['reserved_time_end']).sort_values()[:1].values[0]
                if neareset_gap_pre > background_duration - 5: # # if the smallest gap is larger than the background duration
                    neareset_gap_pre = background_duration - 5 # replace it
                else:
                    pass
                target_timeseries_start = target_timeseries_start - neareset_gap_pre  # replace

            except IndexError:  # there is no nearest glitch that satisfies the above condition
                pass

            try:
                # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                neareset_gap_fol = (df_int[df_int['reserved_time_start'] > target_timeseries_end]['reserved_time_start'] - target_timeseries_end).sort_values()[:1].values[0]
                if neareset_gap_fol > background_duration - 5: # # if the smallest gap is larger than the background duration
                    neareset_gap_fol = background_duration -5 # replace it
                else:
                    pass
                target_timeseries_end = target_timeseries_end + neareset_gap_fol  # replace
            except IndexError:  # there is no nearest glitch that satisfies the above condition
                pass

            # update duration
            duration = round(target_timeseries_end - target_timeseries_start, 3)

            # #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration))
            print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))


            ListIndexSatisfied.append(ind)  # append the index satisfied as secure
            Listtarget_timeseries_start.append(target_timeseries_start)
            Listtarget_timeseries_end.append(target_timeseries_end)
            Listpre_background_start.append(pre_background_start)
            Listpre_background_end.append(pre_background_end)
            Listfol_background_start.append(fol_background_start)
            Listfol_background_end.append(fol_background_end)
            Listgpstime.append(gpstime)
            Listduration.append(duration)
            ListSNR.append(SNR)
            Listconfi.append(confi)
            ListID.append(ID)
        else:
            continue

    # zipped list of segments
    Listsegments = list(
        zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
            Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
            ListSNR, Listconfi, ListID))
    return Listsegments


# ==== save ====
[docs]def clean_duration_for_asd(duration):
    '''
    description:
        This function is used for clean the dicimal points in value of duration to avoid the error shown in ASD estimator in gwpy
    :param duration: duration in sec

    :return: duration: cleaned duration in sec
    '''
    fourth_decimal_num_of_duration = (duration * 1000 - floor(duration * 1000)) * 10  # fourth decimal number
    duration = duration - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration to make an error in whitening process not not to occur
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(duration, 3) - round(duration, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        duration = duration - 0.001  # make the duration with an even number at the third decimal point
    elif third_decimal_num_of_duration % 2 == 0:  # even number at the third decimal point
        pass  # keep the original duration as it is
        duration = round(duration, 3)  # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
    return duration


[docs]def Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration):
    '''
    description:
        This is used for normalizing the spectrum of each trial in the on-source window of random time stamps created with FindBGlistBurnIn()
        iterate each trial for each dummy on
        1. from the time series of a single channel
        2. calculate how many trials are available per dummy on-source window within the extended on-source window
        3. iterate trials per dummy on-source window
    USAGE: list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration)

    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param array_dummy_duration: numpy array of dummy on-source windows

    :return:    list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window.
                                    These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, ....   ])
                                    Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....]
                sample_rate: sampling rate of this channel
                DURATION: a duration of a target segment
                list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window.
                                    The longer the dummy on-source, the fewer the trials are available
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = target_timeseries_end - target_timeseries_start
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration to make an error in whitening process not not to occur
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
    elif third_decimal_num_of_duration % 2 == 0:  # even number at the third decimal point
        pass  # keep the original duration as it is
    DURATION = round(DURATION, 3)  # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
    # get the on-source time series
    on_source = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    on_source_start = on_source.span[0]
    on_source_end = on_source.span[1]
    full_timeseries_start = full_timeseries.span[0]
    full_timeseries_end = full_timeseries.span[1]
    # average background ASD
    length_for_ASD = 128 # in sec
    timeseries_for_asd = full_timeseries.crop(end=full_timeseries_start+length_for_ASD)
    list_whitened_fft = []
    list_num_trial_used = []
    for dummy_duration in array_dummy_duration: # iterate dummy on-source window
        dummy_duration = clean_duration_for_asd(dummy_duration)
        # number of BG trials
        iterate = int(DURATION / dummy_duration)
        ## --------cropping from the starting time------
        # take some fraction of trials
        list_index_trials = select_some_trials(iterate, maximum_iterate=10)
        crop_start_list = [on_source_start + i * dummy_duration for i in list_index_trials]  # a list of starting times cropped segments
        crop_end_list = [crop_start + dummy_duration for crop_start in crop_start_list]  # a list of ending times of cropped segments
        # avoid the cropping warning
        crop_end_list = [crop_end if crop_end < on_source_end else on_source_end for crop_end in crop_end_list]  # if end time is replaced with the end of the BG segments if it is larger
        # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
        ave_background_asd = timeseries_for_asd.asd(dummy_duration, 0.5 * dummy_duration, method='median').value
        if (timeseries_for_asd.value == 0).all() is True: # if the time series is all zero
            whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
        elif ave_background_asd[ave_background_asd != 0].shape[0] == 0:  # if the time series is all zero
            whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
        else:
            # replace value of elements equal to zero with non-zero nearest neighbor
            ave_background_asd_new = ave_background_asd.copy()
            zero_indecies = np.where(ave_background_asd == 0)[0]
            nonzero_indecies = np.where(ave_background_asd != 0)[0]
            if zero_indecies.shape[0] == 0: # if everything is non-zero
                pass
            # replace value of elements equal to zero with non-zero nearest neighbor
            else: # if some of them are zero
                for zero_ind in zero_indecies: #
                    # nearest index with non-zero value
                    k = np.argsort(np.abs((zero_ind - nonzero_indecies)))[0]
                    ave_background_asd_new[zero_ind] = ave_background_asd[nonzero_indecies[k]]
            # calculate each whitened FFT
            whitened_fft_ind_list = [on_source.crop(start=crop_start, end=crop_end).asd(dummy_duration, 0.5 * dummy_duration, method='median').value / ave_background_asd_new for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
        del crop_start_list, crop_end_list  # delete redundant lists
        whitened_fft = np.concatenate(whitened_fft_ind_list, axis=None)
        del whitened_fft_ind_list  # delete redundant a list
        # fill
        list_whitened_fft.append(whitened_fft)
        # number of trials used
        num_trial_used = len(list_index_trials)
        # fill
        list_num_trial_used.append(num_trial_used)
    return list_whitened_fft, sample_rate, DURATION, list_num_trial_used


[docs]def SaveDummyTargetHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False', trial_duration_sample=20):
    '''
    description:
        Save the normalized spectrum for every channels for every glitches, with the use of Multiprocess_whitening_for_burn_in()
        1. iterate all the samples
        2. whiten the on-source window for every channels
        4. save the whitened spectrum as a HDF5 file
    each group in the HDF5 is for a single channel and each of datasets per group has the normalized spectrum for a dummy on-source window

    USAGE: SaveTargetAndBackGroundHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: a list of segment parameters
    :param re_sfchs: a list of safe channels
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :param trial_duration_sample: a number of dummy on-source
    :return None
    '''
    # check if the output directory exists
    if os.path.isdir(outputpath) is False:
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # remove the astro cashe
    if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
        os.remove('~/.astro/cashe/downlowd/py3/lock')
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            try: # deal with time series in bad state
                if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                else:
                    pass
                channels_used = channels
                if PlusHOFT == 'True': # get the data of hoft
                    if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    else:
                        pass
                    full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                    del full_timeseriesDict_hoft # delete the hoft object
                    channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
                # =====================================================
            except IndexError:
                print('Bad data, go to a next GPS')
                continue # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError: # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR # snr
            g.attrs['confidence'] = confi # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            # total on-source window
            on_source_duration = target_timeseries_end - target_timeseries_start
            # duration on trial on-source window
            #------------------------------------------------------------+
            # randomly choose dummy on-source windows uniform in log10   |
            #------------------------------------------------------------+
            duration_min = 0.02  # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s
            duration_max = 15
            if duration_max > on_source_duration:
                duration_max = on_source_duration
            array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3)
            try:
                ch_labels = range(len(channels_used)) # a list of channel labels
                list_array_dummy_duration = []
                List_list_whitened_fft = [] # a list of whitened fft of target
                List_sample_rate = [] # a list of sample rates
                List_DURATION = []
                list_list_num_trial_used = []
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for list_whitened_fft, sample_rate, DURATION, list_num_trial_used in executor.map(Multiprocess_whitening_for_burn_in,
                                                                                                       full_timeseriesDict.values(),
                                                                                                       itrtls.repeat(target_timeseries_start),
                                                                                                       itrtls.repeat(target_timeseries_end),
                                                                                                       itrtls.repeat(array_dummy_duration)
                                                                                             ):
                        List_list_whitened_fft.append(list_whitened_fft) # fill
                        List_sample_rate.append(sample_rate) # fill
                        List_DURATION.append(DURATION) # fill
                        list_list_num_trial_used.append(list_num_trial_used) # fill
                for ch_label in ch_labels: # iterate through channel labels
                    sub_g = g.create_group("ch{:05d}".format(ch_label))
                    sub_g.attrs['SamplingRate'] = List_sample_rate[ch_label]  # target
                    sub_g.attrs['channel'] = channels_used[ch_label].split(':')[1]  # target
                    for dummy_duration_index in range(len(array_dummy_duration)):
                        dset = sub_g.create_dataset("dummy{:05d}".format(dummy_duration_index), data=List_list_whitened_fft[ch_label][dummy_duration_index], compression="gzip", compression_opts=9)  # target
                        dset.attrs['SamplingRate'] = List_sample_rate[ch_label]  # target
                        dset.attrs['duration'] = array_dummy_duration[dummy_duration_index]  # target
                        dset.attrs['num_trial'] = list_list_num_trial_used[ch_label][dummy_duration_index]  # target
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            except IOError: # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))


# ====== save for  multi-frequency bands search =====

[docs]def BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'):
    '''
    description:
        For a single channel per glitch, this function calculates a list of the background upper threshold across dummy on-source windows
    USAGE: list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None')

    :param list_dummy_duration: a list of dummy on-source window
    :param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window
    :param sampling_rate: sampling rate of a channel
    :param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window
    :param sigma: an integer to determine the upper bound of the off-source window
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return: Count: importance
    '''
    list_bg_upper_threshold = np.array([])
    for dummy_duration_index, dummy_duration in enumerate(list_dummy_duration, 0):
        sp = list_whitened_fft[dummy_duration_index]
        num_trial_used = list_num_trial_used[dummy_duration_index]
        try: # test if it is an array
            sp[0]
        except ValueError: # if it is None
            sp = 0 # assign an integer
        #=======================
        if type(sp) != int:
            sp = sp.reshape(int(num_trial_used), int(len(sp)/num_trial_used)) # reshape it
        else:
            sp = np.array([[0]])  # a trick to be used in the line, if np.isnan(sp[0]) != True :
        #=======================
        #++++++ for the case where there is no data measured at a channel +++++++++++++++++++
        #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++
        #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        if np.isnan(sp[:,0]).any() != True :  # if sp has data measured
            # low, high, or band pass filter if we know which frequency band is important
            if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
                ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
                if sampling_rate / 2 <= LowerCutOffFreq:
                    sp = np.array([[0]])
                else:
                    LowerCutOffElement = int(LowerCutOffFreq * dummy_duration)
                    if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
                        sp = sp[:, LowerCutOffElement:]
                    else: # if LowerCutOffElement is greater than the length of the vector
                        sp = np.array([[0]])
            elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
                # even if UpperCutOffElement is greater than the length of the vector is works fine
                UpperCutOffElement = int(UpperCutOffFreq * dummy_duration)
                sp = sp[:, :UpperCutOffElement]
            elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':  # band pass filter
                ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
                if sampling_rate / 2 <= LowerCutOffFreq:
                    sp = np.array([[0]])
                else:
                    LowerCutOffElement = int(LowerCutOffFreq * dummy_duration)
                    UpperCutOffElement = int(UpperCutOffFreq * dummy_duration)
                    if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
                        sp = sp[:, LowerCutOffElement:UpperCutOffElement]
                    else: # if LowCutOffElement is greater than the length of the vector
                        sp = np.array([[0]])
            else:
                pass

            if sp[0,0] == 0:
                GausCeiling = 0
            else:
                GausCeiling = sp.mean() + sigma * sp.std(ddof=1) # gaussian noise ceiling
        else:
            GausCeiling = 0 # manually put zero because this channel has no contribution at this glitch ID
        list_bg_upper_threshold = np.append(list_bg_upper_threshold, GausCeiling)
    return list_bg_upper_threshold


[docs]def BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma):
    '''
    description:
        calculate values of the background upper threshold per dummy on-source window and per frequency band
    USAGE: MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma)

    :param list_dummy_duration: a list of dummy on-source window
    :param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window
    :param sampling_rate: sampling rate of a channel
    :param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window
    :param sigma: an integer to determine the upper bound of the off-source window
    :return: MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
    '''
    # iterate different frequency bands in the list

    MatBGUpperThresh = np.array([])
    for lower_freq_cut, upper_freq_cut in Const.freq_bands:
        list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate,
                                                                                   list_num_trial_used, sigma, LowerCutOffFreq=lower_freq_cut, UpperCutOffFreq=upper_freq_cut)
        list_bg_upper_threshold = list_bg_upper_threshold.reshape(1, list_bg_upper_threshold.shape[-1])
        MatBGUpperThresh = np.vstack([MatBGUpperThresh, list_bg_upper_threshold]) if MatBGUpperThresh.size else list_bg_upper_threshold
    return MatBGUpperThresh


[docs]def CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma):
    '''
    discription:
        1.  make list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window.
                                    These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, ....   ])
                                    Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....]
            sample_rate: sampling rate of this channel
            DURATION: a duration of a target segment
            list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window.
                                The longer the dummy on-source, the fewer the trials are available
        2. calculate values of the background upper threshold per dummy on-source window and per frequency band
    USAGE: MatBGUpperThresh, sample_rate = CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma)

    :param full_timeseries: the full time series in gwpy object including on- and off source windows
    :param target_timeseries_start: the start time of the on-source window
    :param target_timeseries_end: the end time of the on-source window
    :param array_dummy_duration: numpy array of dummy on-source windows
    :param sigma: an interger to calculate the value of importance
    :return:
        MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
        sample_rate: a sampling rate of a single channel
    '''
    # calculate the whitened FFT of the on- and off-source windows
    list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration)
    # calculate the importance
    MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(array_dummy_duration, list_whitened_fft, sample_rate, list_num_trial_used, sigma)
    return MatBGUpperThresh, sample_rate


[docs]def CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample):
    '''
    description:
        1. use a single glitch time
        2. query timeseries of all the channels around a glitch
        3. calculate values of the background upper threshold per dummy on-source window per frequency band
        4. iterate through channels

    USAGE: IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max = 15, trial_duration_sample = 20)

    :param Listsegment: a list of segment parameters
    :param IFO
    :param re_sfchs: a list of safe channels
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :param sigma: an integer to be used for calculating values of importance
    :param maximum value of dummy on-sourc window in sec
    :param a number of dummy on-source windows within the total on-source window
    :return:
        IndexSatisfied: glitch index
        list_mat_BG_upper_thresh: a list of matrices per channel where element of a matrix is a value of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
        array_dummy_duration: numpy array of the dummy on-source windows
        list_sample_rates: a list of sampling rates of channels, numpy array
        re_sfchs: list of channels without "IFO:" at the beginning
        gpstime: a gps time
        duration: a value of duration
        SNR: signal to noise ratio in the h(t)
        conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None
        ID: a glitch ID, provided by Gravity Spy in usual
    '''
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment
    t0 = time.time()
    # extracting the target data and the proceeding and following backgrounds
    try: # deal with time series in bad state
        if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        else:
            pass
        if PlusHOFT == 'True': # get the data of hoft
            if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            else:
                pass
            full_timeseriesDict.append(full_timeseriesDict_hoft) # append
            del full_timeseriesDict_hoft # delete the hoft object
            re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN')
    except IndexError as err:
        print("IndexError: {}".format(err.message))
        # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099)
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    except RuntimeError as err:
        print("RuntimeError: {}".format(err.message))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    except IOError as err: # error when getting time series
        print('IOError: {}'.format(err.message))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
        print('ValueError')
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    try:
        # total on-source window
        on_source_duration = target_timeseries_end - target_timeseries_start
        # duration on trial on-source window
        # -------------------------------------------+
        # randomly choose dummy on-source windows   |
        # -------------------------------------------+
        duration_min = 0.02  # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s
        if duration_max > on_source_duration:
            duration_max = on_source_duration
        array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3)
        list_mat_BG_upper_thresh = [] # a list of rho
        list_sample_rates = np.array([])
        # whitened target and BG segments with multi processes
        with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
            for MatBGUpperThresh, sample_rate in executor.map(CreateBGUpperThresh_single_channel_multiband,
                                                              full_timeseriesDict.values(),
                                                              itrtls.repeat(target_timeseries_start),
                                                              itrtls.repeat(target_timeseries_end),
                                                              itrtls.repeat(array_dummy_duration),
                                                              itrtls.repeat(sigma)
                                                              ):
                # fill
                list_mat_BG_upper_thresh.append(MatBGUpperThresh)
                list_sample_rates = np.append(list_sample_rates, sample_rate)
    except ValueError:
        print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    except IOError: # IOError: Driver write request failed
        print('*** delete group gps{:05d} due to memory error  ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    t1 = time.time()
    print('Time taken: {} s'.format(t1 - t0))
    return IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID  # pass the parameters


[docs]def SaveUppperThreshodBG_multiband_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max=15, trial_duration_sample=20):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        1. iterate glitch samples
        2. get values of the background upper threshold per dummy on-source window per frequency band for each of channels
        4. save the values
        Note this depends on
    USAGE: SaveUppperThreshodBG_multiband_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max = 15, trial_duration_sample = 20)

    :param Listsegments: a list of segment parameters
    :param re_sfchs: a list of safe channels
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :param sigma: an integer to determine the upper bound of the off-source window
    :param duration_max: a maximum value of length of dummy on-source window in sec
    :param trial_duration_sample: a number of dummy on-source window within the total on-source window
    :return None
    '''
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # remove the astro cashe
    if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
        os.remove('~/.astro/cashe/downlowd/py3/lock')
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for Listsegment in Listsegments:
            IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample)
            if type(duration) == int:  # if some error occurs, List_rho is 0, go to the next iteration
                continue
            else:
                g = f.create_group('gps{:05d}'.format(IndexSatisfied))
                g.attrs['GPS'] = gpstime  # a peak GPS time
                g.attrs['duration'] = duration
                g.attrs['snr'] = SNR  # snr
                g.attrs['confidence'] = confi  # confidence
                g.attrs['id'] = ID
                g.attrs['ifo'] = IFO
                g.create_dataset("dummy_durations", data=array_dummy_duration, compression="gzip", compression_opts=9)  # matrix of frequency bnads
                freq_bands = np.array(Const.freq_bands).tolist()
                for i in range(len(freq_bands)):
                    for j in range(len(freq_bands[0])):
                        freq_bands[i][j] = freq_bands[i][j].encode('utf8')
                g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads

                for ch_label in range(len(re_sfchs_used)):  # iterate through channel labels
                    dset = g.create_dataset("ch{:05d}".format(ch_label), data=list_mat_BG_upper_thresh[ch_label], compression="gzip", compression_opts=9)
                    dset.attrs['SamplingRate'] = list_sample_rates[ch_label]  # target
                    dset.attrs['channel'] = re_sfchs_used[ch_label].encode('utf8')   # target

#--------------------------
# post-process for burn-in
#--------------------------

[docs]def make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file):
    '''
    description:
        make a dictionry of values of the background upper threshold across dummy on-source window and frequency bands per channel

    USAGE: list_all_dummy_duration_sorted, mat_ch_sorted_dict = make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file)

    :param input_dir: input directory
    :param input_hdf5_file: name of a HDF5 file
    :return: list_all_dummy_duration_sorted: ascending list of dummy on-source windows
            mat_ch_sorted_dict: a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band
            where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right
    '''
    # path to a HDF5 file which contains all the values of the background upper threshold across channels and frequency bands
    path_input_hdf5 = os.path.join(input_dir, input_hdf5_file)
    if os.path.isfile(path_input_hdf5) is False:
        print("No input background upper threshold h5py file called {}".format(path_input_hdf5))
        print("Exit this program")
        sys.exit()
    # open the input hdf5 file with reading mode
    f = h5py.File(path_input_hdf5, 'r')
    # list of glitches
    list_g = list(f.keys())
    # --------------------------+
    # get the list of channels  |
    # --------------------------+
    list_channels = np.array([])
    g = f[list_g[0]]
    for dset_ind in g.keys():
        if dset_ind[:2] != "ch":
            continue
        channel = g[dset_ind].attrs["channel"].decode('utf8')
        list_channels = np.append(list_channels, channel)
    #-------------------------+
    # get the frequency bands |
    #-------------------------+
    list_freq_bands = g['freq_bands'][()]
    #--------------------------------------------------------------------------------------------------------------+
    # get the background upper thresholds for all the glitches and all the channels across all the frequency bands |
    #--------------------------------------------------------------------------------------------------------------+
    mat_ch_dict = {} # keys are channels
    list_all_dummy_duration = np.array([]) # list of all the dummy durations
    # several dummy durations are chosen per glitch
    for gps_ind in f.keys():  # iterate through glitches
        g = f[gps_ind]
        list_dummy_duration = g['dummy_durations'][()]
        list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration)
        for ch_ind in range(len(list_channels)):  # iterate through channels
            # extract the matrix of the back ground upper threshold per channel per glitch
            # each element in this matrix is the BG upper threshold value for a given frequency band and a given dummy duration
            # frequency bands are listed in row from 0 to 7, for example
            # dummy durations are in columns
            bg_up_mat = g["ch{:05d}".format(ch_ind)][()]
            if gps_ind == list_g[0]: # if the iteration is the first glitch
                mat_ch_dict[list_channels[ch_ind]] = bg_up_mat # fill the matrix to the dictionary
            else: # otherwise
                # stack to the previous matrix
                mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat
    #-----------------------------------------------------------------------+
    # short the upper threshold ascendingly in the order of dummy durations |
    #-----------------------------------------------------------------------+
    sorted_index = np.argsort(list_all_dummy_duration)  # sort index ascending
    list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index] # sorted dummy durations
    mat_ch_sorted_dict = {}
    for channel in mat_ch_dict.keys(): # iterate through channels
        mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index] # sort the values and fill
    print("Finished extracting values of the background upper threshold")
    return list_all_dummy_duration_sorted, mat_ch_sorted_dict


[docs]def save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10):
    '''
    description:
        1. fit the polynomial function against the background upper threshold as a function of dummy on-source window per freq band per channel
        2. save the polynomial parametes to a HDF5 file

    USAGE: save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10)

    :param list_all_dummy_duration_sorted: ascending list of dummy on-source windows
    :param mat_ch_sorted_dict:  a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band
            where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right
    :param output_dir: output directory
    :param output_hdf5_file: otuput file name
    :param med_abs_sigma: a interger number of median absolute error to remove outliers for fitting
    :param poly_degree: polynomial degree
    :return: None
    '''
    path_output_hdf5 = os.path.join(output_dir, output_hdf5_file)
    try:
        f = h5py.File(path_output_hdf5, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(path_output_hdf5))
        os.system('rm {}'.format(path_output_hdf5))

    # open a HDF5 file in writing mode
    with h5py.File(path_output_hdf5, 'a') as f:
        for channel in mat_ch_sorted_dict.keys(): # iterate through channels
            print(channel)
            g = f.create_group(channel)
            bg_up_mat = mat_ch_sorted_dict[channel]
            for band_ind in range(bg_up_mat.shape[0]): # iterate through frequency bandas
                print('freq band {}'.format(band_ind))
                # remove nan samples
                bg_up_bd_nonnan_ind = np.where(np.isnan(bg_up_mat[band_ind, :]) == False)[0]
                sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
                sample_y = bg_up_mat[band_ind, :][bg_up_bd_nonnan_ind]
                # remove extreme large outliers
                median = np.median(sample_y)
                med_abs_dev = stats.median_absolute_deviation(sample_y)
                if med_abs_dev == 0:  # if all the values are zero, med_abs_dev will be zero
                    med_abs_dev = 1
                non_ext_outliner_ind = \
                np.where((median - med_abs_sigma * med_abs_dev < sample_y) & (sample_y < median + med_abs_sigma * med_abs_dev))[0]
                sample_x = sample_x[non_ext_outliner_ind]
                sample_y = sample_y[non_ext_outliner_ind]
                # fit with a polynomial
                z = np.polyfit(sample_x, sample_y, poly_degree)
                dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9)
                dset.attrs['channel'] = channel.encode('utf8')
                dset.attrs['freqband'] = int(band_ind)
    print("Finished extrapolate values of the background upper threshold")
    print("Saved as {}".format(path_output_hdf5))
    return None



#---------------
# for target sample
#---------------
[docs]def FindglitchlistextendBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf):
    '''
    description:
        1. glitch data set (.csv file)
        2. get the data about the target glitch class
        3. get the subset of the target glitches based on SNR and confidence level threshold a user defines
        4. the preceeding and following BGs are 64 sec-long for every samples regardless the overlapping with any other glitches
        5. return the info of the accepted glitches
    USAGE: Listsegments = FindglitchlistextendBG(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf)

    :param df: GravitySpy meta data in pandas format
    :param Epochstart: starting time of an epoch
    :param Epochend:  end time of an epoch
    :param Commissioning_lt: commissioning time in list
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
    :param gap: a time gap between the target and the background segments in sec, 1 sec in default
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers given by an ETG  for target glitches
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers given by an ETG queries for target glitches
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers given by an ETG queries for target glitches
    :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
    :return: the list of parameters of glitches passing the above thresholds
        Listsegments contains of
            ListIndexSatisfied: a list of index of glitches
            Listtarget_timeseries_start: a list of a target glitch starting time
            Listtarget_timeseries_end: a list of a target glitch ending time
            Listpre_background_start: a list of preceding background starting time
            Listpre_background_end; a list of preceding background ending time
            Listfol_background_start: a list of following background starting time
            Listfol_background_end: a list of following background ending time
            Listgpstime: a list of GPS times
            Listduration: a list of durations
            ListSNR: a list of SNRs
            Listconfi: a list of confidence levels
            ListID: a list of IDs
    '''


    df_list = []
    # take the data in an Epoch
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
        df_list.append(df_epoch)
    df_int = pd.concat(df_list)


    #### improvement for duration 2019-04-05 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
    FractionEnergy = 1. # (90% of energy )
    # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
    # df_int = df_int.assign(gap = gap + (1 - FractionEnergy)/2.*df_int['duration'])
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90


    # make the target segments
    # I reserve duration +/- gap around a GPS time of a glitch

    if UserDefinedDuration > 0: # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration
    else:
        pass

    df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])

    # reserving_time does not include gap
    #df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'])
    #df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'])

    # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre] # I think this is ok 2018-12-15


    # data set containing only a target glitch class
    try: # gravity spy and omicron trigger mode
        df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    except KeyError: # pyCBC mode as there is no column "peakFreq"
        # add central frequency cuts
        df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
        # for cWB triggers
        # df_glitch = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]


    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))


    # the list of values satisfied as secure
    ListIndexSatisfied = []
    Listtarget_timeseries_start = []
    Listtarget_timeseries_end = []
    Listpre_background_start = []
    Listpre_background_end = []
    Listfol_background_start = []
    Listfol_background_end = []
    Listgpstime = []
    Listduration = []
    ListSNR = []
    Listconfi = []
    ListID = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
        duration = df_glitch['duration'].iloc[ind] # # a duration
        SNR = df_glitch['snr'].iloc[ind] # snr
        confi = df_glitch['confidence'].iloc[ind] # confidence level
        ID = df_glitch['id'].iloc[ind] # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
            pass # go to the next line
        else: # if the duration of a glitch is less than 0.02 sec
            continue # go to the next iteration

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # take the background
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end

        background_duration = 64 # duration # duration of both the proceeding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration #target_timeseries_start - gap_individual
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration #target_timeseries_end + gap_individual + background_duration

        print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start,
                                              fol_background_end - fol_background_start, duration))
        if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(
                fol_background_end - fol_background_start, 3) < round(duration, 3):
            sys.exit()

        ListIndexSatisfied.append(ind)  # append the index satisfied as secure
        Listtarget_timeseries_start.append(target_timeseries_start)
        Listtarget_timeseries_end.append(target_timeseries_end)
        Listpre_background_start.append(pre_background_start)
        Listpre_background_end.append(pre_background_end)
        Listfol_background_start.append(fol_background_start)
        Listfol_background_end.append(fol_background_end)
        Listgpstime.append(gpstime)
        Listduration.append(duration)
        ListSNR.append(SNR)
        Listconfi.append(confi)
        ListID.append(ID)

    # zipped list of segments
    Listsegments = list(
        zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
            Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
            ListSNR, Listconfi, ListID))
    return Listsegments

[docs]def Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end):
    '''
    description:
        This is used for multi processing for whitening segments

    USAGE: whitened_fft_target, sample_rate, DURATION = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end)

    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :return:    whitened_fft_target: whitened fft of a target segment
                sample_rate: sampling rate of this channel
                DURATION: a duration of a target segment
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = target_timeseries_end - target_timeseries_start
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration to make an error in whitening process not not to occur
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
    elif third_decimal_num_of_duration % 2 == 0:  # even number at the third decimal point
        pass  # keep the original duration as it is
    DURATION = round(DURATION, 3)  # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
    full_timeseries_start = full_timeseries.span[0]
    full_timeseries_end = full_timeseries.span[1]
    ave_background_asd = full_timeseries.crop(end=full_timeseries_start+128).asd(DURATION, DURATION * 0.5, method='median').value
    # whitening Target
    target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    if (full_timeseries.value == 0).all() is True:  # if the time series is all zero
        whitened_fft_target = np.zeros(ave_background_asd.shape[0])
    elif ave_background_asd[ave_background_asd != 0].shape[0] == 0:  # if the time series is all zero
        whitened_fft_target = np.zeros(ave_background_asd.shape[0])
    else:
        fft_target = target_timeseries.asd(DURATION, DURATION * 0.5, method='median').value # fft of the target
        # whitening the target using the average BG
        whitened_fft_target = fft_target / ave_background_asd
    return whitened_fft_target, sample_rate, DURATION

[docs]def SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1.take the information of the list of allowed target and a preceding and following segments
        2. whiten a target segmement based on the average background segment
        3. find whitened FFT
        4. save the whitened target and backgrounds FFTs
        Note this depends on
    USAGE: SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: a list of segment parameters
    :param re_sfchs: a list of safe channels
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    '''
    # check if the output directory exists
    if os.path.isdir(outputpath) is False:
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # remove the astro cashe
    if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
        os.remove('~/.astro/cashe/downlowd/py3/lock')
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            try: # deal with time series in bad state
                if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
                    full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                else:
                    pass
                channels_used = channels
                if PlusHOFT == 'True': # get the data of hoft
                    if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                        full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    else:
                        pass
                    full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                    del full_timeseriesDict_hoft # delete the hoft object
                    channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
                # =====================================================
            except IndexError:
                print('Bad data, go to a next GPS')
                continue # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError: # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR # snr
            g.attrs['confidence'] = confi # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                ch_labels = range(len(channels_used)) # a list of channel labels
                List_whitened_fft_target = [] # a list of whitened fft of target
                List_sample_rate = [] # a list of sample rates
                List_DURATION = [] # a list of durations
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for whitened_fft_target, sample_rate, DURATION in executor.map(Multiprocess_whitening_for_target,
                                                                                                    full_timeseriesDict.values(),
                                                                                                    itrtls.repeat(target_timeseries_start),
                                                                                                    itrtls.repeat(target_timeseries_end)
                                                                                                    ):
                        List_whitened_fft_target.append(whitened_fft_target) # fill
                        List_sample_rate.append(sample_rate) # fill
                        List_DURATION.append(DURATION) # fill

                for ch_label in ch_labels: # iterate through channel labels
                    dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=List_whitened_fft_target[ch_label], compression="gzip", compression_opts=9)  # target
                    dsetTarget.attrs['SamplingRate'] = List_sample_rate[ch_label]  # target
                    dsetTarget.attrs['channel'] = channels_used[ch_label].split(':')[1]  # target
                    dsetTarget.attrs['duration'] = List_DURATION[ch_label]  # target
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            except IOError: # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))

[docs]def cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq):
    """
    description:
        calculate the importance for a single channel in a given frequency band
    USAGE: Count = cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq)

    :param whitened_fft_target: on-source window normalized spectrum
    :param sampling_rate: sample rate
    :param DURATION: duration of on-source window
    :param poly_para: polynomial parameters of the fit of the background upper threshold as a function of on-source window length
    :param LowerCutOffFreq:
    :param UpperCutOffFreq:
    :return:
    """
    sp = whitened_fft_target
    bg_up_thresh_fitted = np.poly1d(poly_para)
    GausCeiling = bg_up_thresh_fitted(DURATION)
    if np.isnan(sp).any() != True:  # if sp has data measured
        # low, high, or band pass filter if we know which frequency band is important
        if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None':  # high-pass filter
            ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
            if sampling_rate / 2 <= LowerCutOffFreq:
                sp = np.array([0])
            else:
                LowerCutOffElement = int(LowerCutOffFreq * DURATION)
                if LowerCutOffElement <= sp.shape[-1]:  # if LowCutOffElement is within the length of the vector
                    sp = sp[LowerCutOffElement:]
                else:  # if LowerCutOffElement is greater than the length of the vector
                    sp = np.array([0])
        elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None':  # low-pass filter
            # even if UpperCutOffElement is greater than the length of the vector is works fine
            UpperCutOffElement = int(UpperCutOffFreq * DURATION)
            sp = sp[:UpperCutOffElement]
        elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':  # band pass filter
            ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
            if sampling_rate / 2 <= LowerCutOffFreq:
                sp = np.array([0])
            else:
                LowerCutOffElement = int(LowerCutOffFreq * DURATION)
                UpperCutOffElement = int(UpperCutOffFreq * DURATION)
                if LowerCutOffElement <= sp.shape[-1]:  # if LowCutOffElement is within the length of the vector
                    sp = sp[LowerCutOffElement:UpperCutOffElement]
                else:  # if LowCutOffElement is greater than the length of the vector
                    sp = np.array([0])
        else:
            pass
        if GausCeiling == 0:
            Count = 0
        else:
            Count = sp[sp > GausCeiling].shape[0] / float(sp.shape[0])
    else:
        Count = 0  # manually put zero because this channel has no contribution at this glitch ID
    return Count

[docs]def HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para):
    '''
    description:
         calculate the importance for a signle channel for frequency bands

    USAGE: Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para)

    :param whitened_fft_target: whitened fft of the on-source window
    :param duration: a duration of the on-source window
    :param sampling_rate: sampling rate of a channel
    :param list_poly_para: a list of polynomial fit of the background upper threshold per freq band
    :return: Counts_in_multibands: values of importance in different frequency bands, numpy array
    '''
    # iterate different frequency bands in the list
    Counts_in_multibands = np.array([cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para[freq_band_ind], LowerCutOffFreq=Const.freq_bands[freq_band_ind][0], UpperCutOffFreq=Const.freq_bands[freq_band_ind][1]) for freq_band_ind in range(len(Const.freq_bands))])
    return Counts_in_multibands

[docs]def CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para):
    '''
    discription:
        1. calculate the normalized spectrum
        2. compute the value of importance for a single channel across frequency bands
    USAGE: Counts_in_multibands, sample_rate = CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para)

    :param full_timeseries: the full time series in gwpy object including on- and off source windows
    :param target_timeseries_start: the start time of the on-source window
    :param target_timeseries_end: the end time of the on-source window
    :param list_poly_para: a list of polynomial fit of the background upper threshold per freq band
    :return:
        Counts_in_multibands: values of importance in different frequency bands where importance is a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel
        sample_rate: a sampling rate of a single channel
    '''
    # calculate the whitened FFT of the on- and off-source windows
    whitened_fft_target, sample_rate, duration = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end)
    # calculate the importance
    Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sample_rate, duration, list_poly_para)
    return Counts_in_multibands, sample_rate

[docs]def CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh):
    '''
    description:
        1. use a single glitch time
        2. query timeseries of all the channels around a glitch
        3. calculate the normalized spectrum
        4. compute the value of importance for a single channel across frequency bands
    USAGE: IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID  = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh)

    :param Listsegment: a list of segment parameters
    :param IFO
    :param channels: a list of safe channels
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :param hdf5_obj_bg_up_thresh: a HDF5 object that contains the polynomial parameters of the fit that represent the background upper theshold as a functin of on-source window length for all the channels and freq bands
    :return:
        IndexSatisfied: glitch index
        Mat_Count_in_multibands: a matrix of rho where frequencies in rows and channels in columns, numpy array
        list_sample_rates: a list of sampling rates of channels, numpy array
        re_sfchs: list of channels without "IFO:" at the beginning
        gpstime: a gps time
        duration: a value of duration
        SNR: signal to noise ratio in the h(t)
        conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None
        ID: a glitch ID, provided by Gravity Spy in usual
    '''
    # get the polynomical parameters fitted the background upper threshold across channels and the frequency bands
    poly_para_dict = {}
    list_g = list(hdf5_obj_bg_up_thresh.keys())
    num_freq_band = len(list(hdf5_obj_bg_up_thresh[list_g[0]].keys()))
    for ch in hdf5_obj_bg_up_thresh.keys():
        g = hdf5_obj_bg_up_thresh[ch]
        poly_para_dict[ch] = [g["BP{:d}".format(freq_band_ind)][()] for freq_band_ind in range(num_freq_band)]
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    # reorder the keys in the backgroud upper thresh dictionary
    poly_para_dict = {ch: poly_para_dict[ch] for ch in re_sfchs}
    IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment
    t0 = time.time()
    # extracting the target data and the proceeding and following backgrounds
    try: # deal with time series in bad state
        if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        else:
            pass
        if PlusHOFT == 'True': # get the data of hoft
            if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            else:
                pass
            full_timeseriesDict.append(full_timeseriesDict_hoft) # append
            del full_timeseriesDict_hoft # delete the hoft object
            re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN')
    except IndexError as err:
        print("IndexError: {}".format(err.message))
        # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099)
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    except RuntimeError as err:
        print("RuntimeError: {}".format(err.message))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    except IOError as err: # error when getting time series
        print('IOError: {}'.format(err.message))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
        print('ValueError')
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    try:
        Mat_Count_in_multibands = np.array([]) # a list of rho
        list_sample_rates = np.array([])
        # whitened target and BG segments with multi processes
        with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
            for Counts_in_multibands, sample_rate in executor.map(CreateRho_single_channel_multiband_from_bg_up_bd_prior,
                                    full_timeseriesDict.values(),
                                    itrtls.repeat(target_timeseries_start),
                                    itrtls.repeat(target_timeseries_end),
                                    poly_para_dict.values()
                                    ):
                # fill
                #Mat_Count_in_multibands = np.vstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands
                Counts_in_multibands = Counts_in_multibands.reshape(Counts_in_multibands.shape[0], 1)
                Mat_Count_in_multibands = np.hstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands
                list_sample_rates = np.append(list_sample_rates, sample_rate)
    except ValueError:
        print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    except IOError: # IOError: Driver write request failed
        print('*** delete group gps{:05d} due to memory error  ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0, 0, 0
    t1 = time.time()
    print('Time taken: {} s'.format(t1 - t0))
    return IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID  # pass the parameters

[docs]def SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        1. take the information of the list of allowed target and a preceding and following segments
        2. query time series and get normalized spectrum and calculate importance
        4. save the whitened target and backgrounds FFTs
        Note this depends on
    USAGE: SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False')

    :param Listsegments: a list of segment parameters
    :param re_sfchs: a list of safe channels
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param path_hdf5_bg_up_thresh: path to a HDF5 file that contains the polynomial fit of the background upper threshold
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return None
    '''
    # load a HDF5 file that contains the polynomical parameters extrapolating the background upper threshold
    if os.path.isfile(path_hdf5_bg_up_thresh) is False:
        print("No such file for the BG upper threshold: {}".format(path_hdf5_bg_up_thresh))
    f_bg_up_thresh = h5py.File(path_hdf5_bg_up_thresh, 'r')
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it')
        sys.exit()

    # remove the astro cashe
    if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
        os.remove('~/.astro/cashe/downlowd/py3/lock')

    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for Listsegment in Listsegments:
            IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, f_bg_up_thresh)
            if type(duration) == int:  # if some error occurs, List_rho is 0, go to the next iteration
                continue
            else:
                g = f.create_group('gps{:05d}'.format(IndexSatisfied))
                g.attrs['GPS'] = gpstime  # a peak GPS time
                g.attrs['duration'] = duration
                g.attrs['snr'] = SNR  # snr
                g.attrs['confidence'] = confi  # confidence
                g.attrs['id'] = ID
                g.attrs['ifo'] = IFO
                dset_mat_rho = g.create_dataset("mat_rho", data=Mat_Count_in_multibands, compression="gzip", compression_opts=9)  # a matrix of rho
                dset_sample_rates = g.create_dataset("sample_rates", data=list_sample_rates, compression="gzip", compression_opts=9)  # list of sample rates of channels
                dset_channels = g.create_dataset("channels", data=[re_sfch.encode('utf8') for re_sfch in re_sfchs_used], compression="gzip", compression_opts=9)  # list of channels
                freq_bands = np.array(Const.freq_bands).tolist()
                for i in range(len(freq_bands)):
                    for j in range(len(freq_bands[0])):
                        freq_bands[i][j] = freq_bands[i][j].encode('utf8')
                dset_freq_bands = g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads
                #dsetTarget.attrs['duration'] = List_DURATION[ch_label]  # target

    with h5py.File(outputPathFileName, 'r') as f:
        print(f['gps{:05d}'.format(IndexSatisfied)]['mat_rho'][()] )



#
#
# outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data"
# outputfilename = "MagnetometerL1.h5"
#
# list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file = make_bg_up_thesh_interpolate_hdf5(outputpath, outputfilename)
# save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file, '.', 'bg_up_thre.h5')
#
#
#
# #%%%%%%%%%%%%%%%%
#
#
# #------
# # target
# #---------
# path = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/target/HDF5data/MagnetometerL1.h5"
# f_t = h5py.File(path, 'r')
# list_g = list(f_t.keys())
# # --- channels ----
# list_channels = np.array([])
# g = f_t[list_g[0]]
# for dset_ind in g.keys():
#     if dset_ind[:2] != "ch":
#         continue
#     channel = g[dset_ind].attrs["channel"]
#     list_channels = np.append(list_channels, channel)
#
#
#
# ch = 'PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ'
# #ch = 'ASC-SRC2_Y_OUT_DQ'
# ch_ind_int = np.where(list_channels == ch)[0][0]
# list_sp_mean = np.array([])
# list_sp_std = np.array([])
# list_duration = np.array([])
# list_gpstime = np.array([])
# list_sp = []
# for gps in f_t.keys():
#     g = f_t[gps]
#     sp = g['chT{:05d}'.format(ch_ind_int)][()]
#     list_sp.append(sp)
#     sp_mean = sp.mean()
#     sp_std = sp.std(ddof=1)
#     gpstime = g.attrs['GPS']
#     list_gpstime = np.append(list_gpstime, gpstime)
#     duration = g['chT{:05d}'.format(ch_ind_int)].attrs['duration']
#     list_sp_mean = np.append(list_sp_mean, sp_mean)
#     list_sp_std = np.append(list_sp_std, sp_std)
#     list_duration = np.append(list_duration, duration)
#
#
#
# #-----------
# # priori
# #------------
# outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data"
# outputfilename = "MagnetometerL1.h5"
#
#
# path_outputfile = os.path.join(outputpath, outputfilename)
#
#
# f = h5py.File(path_outputfile, 'r')
# list_g = list(f.keys())
#
# #-------------------
# # get the list of channels
# #---------------------
# list_channels = np.array([])
# g = f[list_g[0]]
# for dset_ind in g.keys():
#     if dset_ind[:2] != "ch":
#         continue
#     channel = g[dset_ind].attrs["channel"].decode('utf8')
#     list_channels = np.append(list_channels, channel)
#     list_freq_bands = g['freq_bands'][()]
# #----------------
# # get dummy durations
# # freq bands
# #----------------
#
#
# ch_ind_int = np.where(list_channels == ch)[0][0]
#
# # -- get the backgournd upper thresshod for all the glitches and all the channels
# mat_ch_dict = {}
# list_all_dummy_duration = np.array([])
# for gps_ind in f.keys(): # iterate glitches
#     #if gps_ind != list_g[0]:
#     #    continue
#     g = f[gps_ind]
#     list_dummy_duration = g['dummy_durations'][()]
#     list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration)
#     for ch_ind in range(len(list_channels)): # iterate channels
#         #if ch_ind != ch_ind_int:
#         #    continue
#         #print(ch_ind)
#         bg_up_mat = g["ch{:05d}".format(ch_ind)][()]
#         if gps_ind == list_g[0]:
#             mat_ch_dict[list_channels[ch_ind]] = bg_up_mat
#         else:
#             mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat
#
#
# #-- short the upper threshold ascendingly in the order of dummy durations
# sorted_index = np.argsort(list_all_dummy_duration) # sort index ascending
# list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index]
# mat_ch_sorted_dict = {}
# for channel in mat_ch_dict.keys():
#     mat_ch_dict[channel][:, sorted_index]
#     mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index]
#
#
# # -- fit
#
# file_bg_up_th_hdf5 = 'bg_up_thre.h5'
# try:
#     f = h5py.File(file_bg_up_th_hdf5, 'w-')
#     f.close()
# except IOError:
#     print('The file already exists in {}'.format(file_bg_up_th_hdf5))
#     os.system('rm {}'.format(file_bg_up_th_hdf5))
# # open a HDF5 file in writing mode
# with h5py.File(file_bg_up_th_hdf5, 'a') as f:
#     for channel in list_channels:
#         print(channel)
#         g = f.create_group(channel)
#         for band_ind in range(bg_up_mat.shape[0]):
#             print('freq band {}'.format(band_ind))
#             # remove nan samples
#             bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0]
#             sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
#             sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind]
#             # remove extreme large outliers
#             median = np.median(sample_y)
#             med_abs_dev = stats.median_absolute_deviation(sample_y)
#             if med_abs_dev == 0: # if all the values are zero, med_abs_dev will be zero
#                 med_abs_dev = 1
#             non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0]
#             sample_x = sample_x[non_ext_outliner_ind]
#             sample_y = sample_y[non_ext_outliner_ind]
#             z = np.polyfit(sample_x, sample_y, 10)
#             print('0000')
#             dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9)
#             print('11111')
#             dset.attrs['channel'] = channel.encode('utf8')
#             print('22222')
#             dset.attrs['freqband'] = int(band_ind)
#
#
#
#
# plot
#
# f_up = h5py.File("/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/bg_up_thre.h5", 'r')
# for band_ind in range(8)[7:]:
#     plt.plot(list_all_dummy_duration_sorted, mat_ch_sorted_dict[ch][band_ind, :], label='band {}'.format(band_ind), color='gray', alpha=0.5)
#     # remove nan samples
#     bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0]
#     sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
#     sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind]
#     # remove extreme large outliers
#     median = np.median(sample_y)
#     med_abs_dev = stats.median_absolute_deviation(sample_y)
#     if med_abs_dev == 0:
#         med_abs_dev = 1
#     non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0]
#     sample_x = sample_x[non_ext_outliner_ind]
#     sample_y = sample_y[non_ext_outliner_ind]
#     z = np.polyfit(sample_x, sample_y, 10)
#     print(z)
#     p = np.poly1d(z)
#     plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='-', alpha=0.5,
#              label='polynomial fit')
#     # ####
#     z = f_up[ch]["BP{:d}".format(band_ind)][()]
#     print(z)
#     p = np.poly1d(z)
#     plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='--', alpha=0.5, label='polynomial fit from the file')
#     #plt.errorbar(list_duration, list_sp_mean, yerr=list_sp_std, fmt='o', color='red')
#     plt.legend()
#     plt.title(ch)
#     plt.yscale('log')
#     plt.xscale('log')
#     #plt.ylim(median - 6 * med_abs_dev, median + 6 * med_abs_dev)
#     plt.ylabel("upper cut")
#     plt.xlabel("duration [sec]")
#     plt.savefig('{}.png'.format(ch))
#     plt.close()
#
# # importance
# list_imp = []
# for t_duration_ind in range(len(list_duration)):
#     sp = list_sp[t_duration_ind]
#     imp = sp[sp > p(list_duration[t_duration_ind])].shape[0] / float(sp.shape[0])
#     list_imp.append(imp)
#
# plt.scatter(list_duration, list_imp)
# plt.title(ch)
# plt.xscale('log')
# plt.ylabel("rho")
# plt.xlabel("duration [sec]")
# plt.savefig("imp_{}.png".format(ch))
# plt.close()
#
#
#
#
#
#
#

#
#
# gps = 1262368238
# gps1 = 1262369268
# d = 64
# fft_length = 4
# gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.)
# duration = 3.45
# on = gwdata.crop(end=gwdata.span[0] + duration)
# spectrum = gwdata.asd(fft_length, fft_length /2, method='median')
# interpolated_spectrum = spectrum.interpolate(round(1/duration, 5))
#
#
# on_asd = on.asd(duration, round(duration /2, 3), method='median')
#
#
# print("spectrum df= {} Hz".format(spectrum.df))
# print("interpolated spectrum df= {} Hz".format(interpolated_spectrum.df))
# print("on spectrum df= {} Hz".format(on_asd.df))
#






#
# gps = 1262368238 # glitchy
# gps1 = 1262369268 # quiet
# d = 64
# gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.)
# gwdata1 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-d/2., gps1+d/2.)
# gwdata2 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-1., gps1+1.)
#
#
# list_fft_length = [16, 8, 4]
# list_fft_length = [4]
# list_method = ['median', 'welch']
# #list_method = ['median']
# for fft_length in list_fft_length:
#     for method in list_method:
#         if method == 'welch':
#             ls = '--'
#         else:
#             ls = '-'
#         spectrum = gwdata.asd(fft_length, fft_length /2, method=method)
#         spectrum1 = gwdata1.asd(fft_length, fft_length / 2, method=method)
#         spectrum2 = gwdata2.asd(fft_length, fft_length / 2, method=method)
#         plt.plot(spectrum.frequencies, spectrum.value, label='fft length = {}, method = {}'.format(fft_length, method), alpha=0.5)
#         plt.plot(spectrum1.frequencies, spectrum1.value, label='fft length = {}, method = {}, quiet'.format(fft_length, method), alpha=0.5)
#         plt.plot(spectrum2.frequencies, spectrum2.value, label='fft length = {}, method = {}, quiet, short'.format(fft_length, method), alpha=0.5)
# plt.title("duration = {} sec".format(d))
# plt.xlim(40, 4000)
# plt.ylabel(r'GW strain ASD [strain$/\sqrt{\mathrm{Hz}}$]')
# plt.ylim(1e-24, 3e-21)
# plt.xscale('log')
# plt.yscale('log')
# plt.legend()
# plt.savefig("spectrum.png")
# plt.close()


# #
# # import time as TIME
# # from gwpy.timeseries import TimeSeries, TimeSeriesDict
# # gps = 1175755932.81
# # pre_background_start = gps - 64
# # fol_background_end = gps + 64
# # IFO = "L1"
# # number_process = 5
# # channels = ["L1:PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ"]
# # t0 = TIME.time()
# # full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
# # t1 = TIME.time()
# # print("Time taken in {} sec".format(t1 - t0))
#
#
#
#
#
# import pandas as pd
# import os
# import sys
# import matplotlib
# matplotlib.use("Agg")
# import matplotlib.pyplot as plt
#
#
#
# #------------
# # find the quiet time
# #--------------
#
# path = '/home/kentaro.mogushi/longlived/MachineLearningJointPisaUM/dataset/GravityspyDataSet/O2/gspy-db-20180813.csv'
# df = pd.read_csv(path)
#
# df = df.drop_duplicates(subset=["GPStime", "duration"]).reset_index()
# df = df.assign(reserved_time_start=df['GPStime'] - df['duration'] / 2. - 0.1)
# df = df.assign(reserved_time_end=df['GPStime'] + df['duration'] / 2. + 0.1)
#
#
#
#
#
#
#
#
#
#
#
#
# #
# # for i in range(df.shape[0])[200000:200100]:
# #     plt.plot([df.iloc[i]["reserved_time_start"] - res_start.min(), df.iloc[i]["reserved_time_end"] - res_start.min()], [i,i])
# #
# # plt.ylabel("Glitch index")
# # plt.xlabel("on-source window")
# # plt.savefig("glitch.png")
# # plt.close()
#
#
# c = res_start[1:] - res_end[:-1]
#
#
#
#
# df_q = pd.DataFrame()
#
#
# plt.hist(c, bins=50)
# plt.ylabel("Count")
# plt.xlabel("Quiet time length [sec]")
# plt.savefig("quiet_time_length.png")
# plt.close()
#
#
#
#