## ---- echo = FALSE, include = FALSE, message = FALSE--------------------------
# Global knitr chunk options applied to every chunk of this document.
knitr::opts_chunk$set(
  comment = "#>",   # prefix put in front of each line of printed output
  collapse = TRUE,  # merge a chunk's source and output into a single block
  warning = FALSE,  # do not show warnings in the rendered document
  dpi = 120         # resolution (dots per inch) of generated figures
)


## ----setup, warning = FALSE---------------------------------------------------
library(amerifluxr)
library(data.table)
library(pander)

## ----echo=TRUE, results = "asis"----------------------------------------------
# retrieve the general-info table covering the full list of sites,
#  and keep a data.table copy of it for the queries that follow
sites <- amf_site_info()
sites_dt <- data.table::as.data.table(sites)

# preview the first three sites
pander::pandoc.table(sites_dt[1:3])


## ----results = "asis"---------------------------------------------------------
# total number of registered sites, i.e., the row count of the site table
pander::pandoc.table(nrow(sites_dt))

# total number of sites with available data,
#  i.e., sites whose DATA_START is not missing
pander::pandoc.table(sites_dt[!is.na(DATA_START), .N])

# number of sites with available data, counted per data use policy
pander::pandoc.table(sites_dt[!is.na(DATA_START), .N, by = "DATA_POLICY"])


## ----results = "asis"---------------------------------------------------------
# count all registered sites within each IGBP class
pander::pandoc.table(sites_dt[, .N, by = .(IGBP)])

# same count, restricted to sites with available data
pander::pandoc.table(sites_dt[!is.na(DATA_START), .N, by = .(IGBP)])

# count sites with available data for each combination of IGBP class
#  & data use policy, with the rows ordered by IGBP
pander::pandoc.table(
  sites_dt[!is.na(DATA_START), .N, by = c("IGBP", "DATA_POLICY")][order(IGBP)]
)


## ----results = "asis"---------------------------------------------------------

# get a list of cropland and grassland sites with available data,
#  shared under CC-BY-4.0 data policy,
#  located within 30-50 degree N in latitude (both bounds excluded);
#  return a site list with site ID, name, data starting/ending year
crop_ls <- sites_dt[IGBP %in% c("CRO", "GRA") &
                      !is.na(DATA_START) &
                      LOCATION_LAT > 30 &
                      LOCATION_LAT < 50 &
                      DATA_POLICY == "CCBY4.0",
                    .(SITE_ID, SITE_NAME, DATA_START, DATA_END)]

# show at most the first 10 sites; head() stops at the last row, whereas
#  indexing rows 1:10 pads a shorter list with all-NA rows
pander::pandoc.table(head(crop_ls, 10))


## ----results = "asis"---------------------------------------------------------
# list metadata availability; amf_list_metadata() is called without a
#  site_set here, so the request is not narrowed to particular sites
metadata_aval <- data.table::as.data.table(amf_list_metadata())

# preview the first 3 rows and the first 10 columns
pander::pandoc.table(metadata_aval[1:3, 1:10])

## ----results = "asis"---------------------------------------------------------
# list metadata availability for the cropland & grassland sites in crop_ls
#  (namespace-qualified like every other as.data.table() call in this file)
metadata_aval_sub <- data.table::as.data.table(
  amf_list_metadata(site_set = crop_ls$SITE_ID)
)

# down-select cropland & grassland sites by BADM group of interest,
#  e.g., canopy height (GRP_HEIGHTC): keep sites with GRP_HEIGHTC > 0,
#  sorted from the largest to the smallest GRP_HEIGHTC
crop_ls2 <- metadata_aval_sub[GRP_HEIGHTC > 0,
                              .(SITE_ID, GRP_HEIGHTC)][order(-GRP_HEIGHTC)]

# show at most the first 10 sites; head() stops at the last row, whereas
#  indexing rows 1:10 pads a shorter list with all-NA rows
pander::pandoc.table(head(crop_ls2, 10))

## ----results = "asis"---------------------------------------------------------
# list data availability for the sites retained in crop_ls2
data_aval <- data.table::as.data.table(
  amf_list_data(site_set = crop_ls2$SITE_ID)
)

# preview the first 10 rows
pander::pandoc.table(data_aval[1:10])

## ----results = "asis"---------------------------------------------------------
# down-select cropland & grassland sites based on the available
#  wind speed (WS) and friction velocity (USTAR) data in 2015-2018,
#  regardless of their qualifiers (rows are matched on BASENAME)
data_aval_sub <- data_aval[BASENAME %in% c("WS", "USTAR"),
                           .(SITE_ID, BASENAME, Y2015, Y2016, Y2017, Y2018)]

# for each site, average every yearly column over all of the site's
#  WS and USTAR rows
data_aval_sub <- data_aval_sub[, lapply(.SD, mean),
                               by = "SITE_ID",
                               .SDcols = paste0("Y", 2015:2018)]

# keep the sites whose mean of the four yearly averages
#  (2015-2018) is above 0.75
crop_ls3 <- data_aval_sub[(Y2015 + Y2016 + Y2017 + Y2018) / 4 > 0.75]
pander::pandoc.table(crop_ls3)


## ----results = "asis"---------------------------------------------------------

# keep the wind speed (WS) rows only and compute, for each WS variable,
#  its mean availability over 2015-2018
data_aval_sub2 <- data_aval[
  BASENAME %in% "WS",
  .(SITE_ID, VARIABLE, Y2015_2018 = (Y2015 + Y2016 + Y2017 + Y2018) / 4)
]

# per site, count the WS variables that have any data during 2015-2018
#  (mean availability > 0) and average their availability
data_aval_sub2 <- data_aval_sub2[Y2015_2018 > 0,
                                 .(.N, Y2015_2018 = mean(Y2015_2018)),
                                 by = .(SITE_ID)]

# keep the sites with more than one such WS variable
crop_ls4 <- data_aval_sub2[N > 1]
pander::pandoc.table(crop_ls4)


## ----eval = FALSE-------------------------------------------------------------
#  #### not evaluated, to reduce vignette size
#  # plot data availability for WS & USTAR
#  #  for selected sites in 2015-2018
#  amf_plot_datayear(
#    site_set = crop_ls4$SITE_ID,
#    var_set = c("WS", "USTAR"),
#    nonfilled_only = TRUE,
#    year_set = c(2015:2018)
#  )
#  

## ----results = "asis"---------------------------------------------------------
## summarize the WS and USTAR data of the sites kept in crop_ls3
data_sum <- amf_summarize_data(
  site_set = crop_ls3$SITE_ID,
  var_set = c("WS", "USTAR")
)

## preview the first 10 rows of the summary
pander::pandoc.table(data_sum[1:10, ])


## ----eval = FALSE-------------------------------------------------------------
#  #### not evaluated, to reduce vignette size
#  ## plot data summary of USTAR for selected sites,
#  amf_plot_datasummary(
#    site_set = crop_ls3$SITE_ID,
#    var_set = c("USTAR")
#  )
#  

## ----eval = FALSE-------------------------------------------------------------
#  #### not evaluated, to reduce vignette size
#  ## plot data summary of WS for selected sites,
#  #  including clustering information
#  amf_plot_datasummary(
#    site_set = crop_ls3$SITE_ID,
#    var_set = c("WS"),
#    show_cluster = TRUE
#  )
#