From 9d9a92c912cbac5f2081c095c7d3d01d7cd046b8 Mon Sep 17 00:00:00 2001 From: jdestin <jeremy.destin@inra.fr> Date: Wed, 27 May 2020 14:58:24 +0200 Subject: [PATCH 1/3] fix:Create a folder to store the logs --- scripts/harvest.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/harvest.sh b/scripts/harvest.sh index 07054373..b774b717 100755 --- a/scripts/harvest.sh +++ b/scripts/harvest.sh @@ -119,6 +119,22 @@ for FILE in $(find ${DATA_DIR} -name "*.json"); do gzip $FILE done +LOG_DIR=${DATA_DIR}"log/" +export TMP_LOG_FILE + +if [[ -d ${LOG_DIR} ]] +then + rm -r "${LOG_DIR}" + mkdir "${LOG_DIR}" +else + mkdir "${LOG_DIR}" +fi + + +basename_dir() { "$1$(basename "$(dirname "$2")")/$3" ; } + +export -f basename_dir + for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do echo && echo -e "${BOLD}Manage ${DOCUMENT_TYPE} documents...${NC}" INDEX_PATTERN=$(echo "faidare_${DOCUMENT_TYPE}_${ENV}" | sed -E "s/([a-z])([A-Z])/\1-\2/" | tr '[:upper:]' '[:lower:]') @@ -141,6 +157,12 @@ for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do echo -e "* Index documents into ${ES_HOST}:${ES_PORT}/${INDEX_NAME} indice..." { parallel -j 2 --bar " + + echo test1 + #echo '{= s:.*/[^_]*_:sub/:; =}' + echo {=s:/*::;=} + echo test2 + curl -s -H 'Content-Type: application/x-ndjson' -H 'Content-Encoding: gzip' -H 'Accept-Encoding: gzip' -XPOST ${ES_HOST}:${ES_PORT}/${INDEX_NAME}/_bulk --data-binary '@{}' > {.}.log.gz" \ ::: $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}-*.json.gz") } || { @@ -173,6 +195,7 @@ for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do echo -e "${ORANGE}Expected ${COUNT_EXTRACTED_DOCS} documents but got ${COUNT_INDEXED_DOCS} indexed documents.${NC}" exit 1; fi + sleep 5 # Add aliases ALIAS_PATTERN="${INDEX_PATTERN}-group*" -- GitLab From 612f9997c20a86e03a03a8041692304aa465e480 Mon Sep 17 00:00:00 2001 From: jdestin <jeremy.destin@inra.fr> Date: Thu, 30 Jul 2020 16:19:09 +0200 Subject: [PATCH 2/3] Rebase on master. Create and copy the log in a dedicated repository. GNP-5861 --- scripts/harvest.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/harvest.sh b/scripts/harvest.sh index b774b717..bda1f2e6 100755 --- a/scripts/harvest.sh +++ b/scripts/harvest.sh @@ -119,21 +119,29 @@ for FILE in $(find ${DATA_DIR} -name "*.json"); do gzip $FILE done -LOG_DIR=${DATA_DIR}"log/" -export TMP_LOG_FILE +LOG_DIR="${DATA_DIR%/}/log" if [[ -d ${LOG_DIR} ]] then rm -r "${LOG_DIR}" - mkdir "${LOG_DIR}" -else - mkdir "${LOG_DIR}" fi +export ES_HOST ES_PORT INDEX_NAME LOG_DIR -basename_dir() { "$1$(basename "$(dirname "$2")")/$3" ; } +process_file() { + file=$(basename "$1") + logfile="${file%.*}.log.gz" + source=$(basename "$(dirname "$1")") -export -f basename_dir + if ! [[ -d "${LOG_DIR}/$source" ]] + then + mkdir -p "${LOG_DIR}/$source" + fi + + curl -s -H 'Content-Type: application/x-ndjson' -H 'Content-Encoding: gzip' -H 'Accept-Encoding: gzip' -XPOST "${ES_HOST}:${ES_PORT}/${INDEX_NAME}/_bulk" --data-binary @"$1" > "${LOG_DIR}/$source/$logfile" +} + +export -f process_file for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do echo && echo -e "${BOLD}Manage ${DOCUMENT_TYPE} documents...${NC}" @@ -156,15 +164,7 @@ for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do INDEX_NAME="${INDEX_PATTERN}-d"$(date +%s) echo -e "* Index documents into ${ES_HOST}:${ES_PORT}/${INDEX_NAME} indice..." { - parallel -j 2 --bar " - - echo test1 - #echo '{= s:.*/[^_]*_:sub/:; =}' - echo {=s:/*::;=} - echo test2 - - curl -s -H 'Content-Type: application/x-ndjson' -H 'Content-Encoding: gzip' -H 'Accept-Encoding: gzip' -XPOST ${ES_HOST}:${ES_PORT}/${INDEX_NAME}/_bulk --data-binary '@{}' > {.}.log.gz" \ - ::: $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}-*.json.gz") + parallel -j 2 --bar process_file ::: $(find "${DATA_DIR}" -name "${DOCUMENT_TYPE}-*.json.gz") } || { code=$? echo -e "${RED}ERROR: a problem occurred when trying to index data with parallel program.${NC}" -- GitLab From bcc5c1d9b29fb58c6431cfaf0ad44e1710247d18 Mon Sep 17 00:00:00 2001 From: jdestin <jeremy.destin@inra.fr> Date: Thu, 30 Jul 2020 16:52:03 +0200 Subject: [PATCH 3/3] Rename indexing repository. GNP-5861 --- scripts/harvest.sh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/harvest.sh b/scripts/harvest.sh index bda1f2e6..86379b8c 100755 --- a/scripts/harvest.sh +++ b/scripts/harvest.sh @@ -52,7 +52,7 @@ check_acknowledgment() { local MSG=$2 echo $LOG | jq '.acknowledged? == true' | grep 'true' >/dev/null || { echo -e "${RED}ERROR: a problem occurred when trying to ${MSG}.${NC}" - echo -e "${ORANGE}$(echo ${LOG})${NC}" + echo -e "${ORANGE}$(echo "${LOG}")${NC}" exit 1; } } @@ -61,7 +61,7 @@ check_acknowledgment() { MISSING_COUNT=0 PROGRAMS="gzip parallel jq" for PROGRAM in ${PROGRAMS}; do - command -v ${PROGRAM} >/dev/null || { + command -v "${PROGRAM}" >/dev/null || { echo -e "${ORANGE}Program ${PROGRAM} is missing, cannot continue...${NC}" ((MISSING_COUNT += 1)) } @@ -102,24 +102,24 @@ if [ ! -d "${DATA_DIR}" ]; then echo -e "${RED}ERROR: Mandatory parameter 'jsonDir' is missing!${NC}" echo && help fi -if [ $(find ${DATA_DIR} -name "*.json" | wc -l) -le 0 ] && [ $(find ${DATA_DIR} -name "*.json.gz" | wc -l) -le 0 ]; then +if [ $(find "${DATA_DIR}" -name "*.json" | wc -l) -le 0 ] && [ $(find "${DATA_DIR}" -name "*.json.gz" | wc -l) -le 0 ]; then echo -e "${RED}ERROR: The JSON directory ${DATA_DIR} contains no JSON files!${NC}" echo && help fi [ "${DOCUMENT_TYPES}" == "all" ] && DOCUMENT_TYPES="${ALL_DOCUMENT_TYPES}" for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do - if [ $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}*.json" | wc -l) -le 0 ] && [ $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}*.json.gz" | wc -l) -le 0 ]; then + if [ $(find "${DATA_DIR}" -name "${DOCUMENT_TYPE}*.json" | wc -l) -le 0 ] && [ $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}*.json.gz" | wc -l) -le 0 ]; then echo -e "${ORANGE}WARNING: The JSON directory ${DATA_DIR} contains no ${DOCUMENT_TYPE} document. Type will be ignored!${NC}" DOCUMENT_TYPES=$(echo "${DOCUMENT_TYPES}" | sed "s/ *${DOCUMENT_TYPE} */ /g") fi done # Compress JSON files -for FILE in $(find ${DATA_DIR} -name "*.json"); do - gzip $FILE +for FILE in $(find "${DATA_DIR}" -name "*.json"); do + gzip "$FILE" done -LOG_DIR="${DATA_DIR%/}/log" +LOG_DIR="${DATA_DIR%/}/indexing-log" if [[ -d ${LOG_DIR} ]] then @@ -155,8 +155,8 @@ for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do \"index_patterns\": [\"${INDEX_PATTERN}-*\"], \"order\": 101, \"mappings\": - $(cat ${BASEDIR}/../backend/src/test/resources/fr/inra/urgi/faidare/repository/es/setup/index/${DOCUMENT_TYPE}_mapping.json), - \"settings\": $(cat ${BASEDIR}/../backend/src/test/resources/fr/inra/urgi/faidare/repository/es/setup/index/settings.json) + $(cat "${BASEDIR}"/../backend/src/test/resources/fr/inra/urgi/faidare/repository/es/setup/index/${DOCUMENT_TYPE}_mapping.json), + \"settings\": $(cat "${BASEDIR}"/../backend/src/test/resources/fr/inra/urgi/faidare/repository/es/setup/index/settings.json) }") check_acknowledgment "${LOG}" "create template" @@ -170,7 +170,7 @@ for DOCUMENT_TYPE in ${DOCUMENT_TYPES}; do echo -e "${RED}ERROR: a problem occurred when trying to index data with parallel program.${NC}" exit $code } - parallel "gunzip -c {} | jq '.errors' | grep -q true && echo -e '${ORANGE}ERROR found in {}${NC}' >> ${TMP_FILE} ;" ::: $(find ${DATA_DIR} -name "${DOCUMENT_TYPE}-*.log.gz") + parallel "gunzip -c {} | jq '.errors' | grep -q true && echo -e '${ORANGE}ERROR found in {}${NC}' >> ${TMP_FILE} ;" ::: $(find "${DATA_DIR}" -name "${DOCUMENT_TYPE}-*.log.gz") if [ -f "${TMP_FILE}" ] && [ -s "${TMP_FILE}" ]; then echo -e "${RED}ERROR: a problem occurred when trying to index data into ${ES_HOST}:${ES_PORT}/${INDEX_NAME} indice.${NC}" echo -e "${ORANGE}$(cat ${TMP_FILE})${NC}" -- GitLab