/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.sink;

import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sink.event.CommitAckEvent;
import org.apache.hudi.sink.event.WriteMetadataEvent;
import org.apache.hudi.sink.utils.CoordinatorExecutor;
import org.apache.hudi.sink.utils.HiveSyncContext;
import org.apache.hudi.sink.utils.NonThrownExecutor;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
import org.apache.flink.runtime.operators.coordination.TaskNotRunningException;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.apache.hudi.util.StreamerUtil.initTableIfNotExists;

/**
 * {@link OperatorCoordinator} for {@link StreamWriteFunction}.
 *
 * <p>This coordinator starts a new instant when a new checkpoint starts. It commits the instant when all the
 * operator tasks write the buffer successfully for a round of checkpoint.
 *
 * <p>If there is no data for a round of checkpointing, it resets the events buffer and returns early.
 *
 * @see StreamWriteFunction for the work flow and semantics
 */
public class StreamWriteOperatorCoordinator
    implements OperatorCoordinator {
  private static final Logger LOG = LoggerFactory.getLogger(StreamWriteOperatorCoordinator.class);

  /**
   * Config options.
   */
  private final Configuration conf;

  /**
   * Coordinator context.
   */
  private final Context context;

  /**
   * Write client.
   */
  private transient HoodieFlinkWriteClient writeClient;

  /**
   * Current REQUESTED instant, for validation.
   */
  private volatile String instant = WriteMetadataEvent.BOOTSTRAP_INSTANT;

  /**
   * Event buffer for one round of checkpointing. When all the elements are non-null and have the same
   * write instant, the instant has succeeded and we can commit it.
   */
  private transient WriteMetadataEvent[] eventBuffer;

  /**
   * Task number of the operator.
   */
  private final int parallelism;

  /**
   * A single-thread executor to handle all the asynchronous jobs of the coordinator.
   */
  private CoordinatorExecutor executor;

  /**
   * A single-thread executor to handle asynchronous hive sync.
   */
  private NonThrownExecutor hiveSyncExecutor;

  /**
   * Context that holds variables for asynchronous hive sync.
   */
  private HiveSyncContext hiveSyncContext;

  /**
   * A single-thread executor to handle metadata table sync.
   */
  private NonThrownExecutor metadataSyncExecutor;

  /**
   * The table state.
   */
  private transient TableState tableState;

  /**
   * Creates a coordinator for the stream write operator.
   *
   * <p>The operator parallelism is read from the given context once, at construction time.
   *
   * @param conf    The config options
   * @param context The coordinator context
   */
  public StreamWriteOperatorCoordinator(Configuration conf, Context context) {
    this.context = context;
    this.parallelism = context.currentParallelism();
    this.conf = conf;
  }

  /**
   * Starts the coordinator: allocates the event buffer, creates the write client and the table state,
   * initializes the table and creates the executors.
   *
   * @throws Exception if any of the initialization steps fails
   */
  @Override
  public void start() throws Exception {
    // initialize event buffer
    reset();
    this.writeClient = StreamerUtil.createWriteClient(conf);
    this.tableState = TableState.create(conf);
    // init table, create it if not exists.
    initTableIfNotExists(this.conf);
    // start the executor that runs the asynchronous jobs of this coordinator
    this.executor = new CoordinatorExecutor(this.context, LOG);
    // create the hive sync / metadata sync executors only when the corresponding sync is enabled
    if (tableState.syncHive) {
      initHiveSync();
    }
    if (tableState.syncMetadata) {
      initMetadataSync();
    }
  }

  /**
   * Releases the resources held by the coordinator.
   *
   * <p>All the executors are closed first and the write client last: the jobs submitted to the
   * executors (instant commit, compaction scheduling, metadata table sync) call into the write client,
   * so it must stay usable until the executors are shut down.
   *
   * @throws Exception if closing any of the resources fails
   */
  @Override
  public void close() throws Exception {
    // teardown the executors before the write client they depend on
    if (executor != null) {
      executor.close();
    }
    if (hiveSyncExecutor != null) {
      hiveSyncExecutor.close();
    }
    // the metadata sync executor is only created when metadata sync is enabled (or injected by tests)
    if (metadataSyncExecutor != null) {
      metadataSyncExecutor.close();
    }
    if (writeClient != null) {
      writeClient.close();
    }
    this.eventBuffer = null;
  }

  /**
   * Takes the coordinator checkpoint. The coordinator snapshots no state of its own,
   * so the result future is completed with an empty byte array.
   *
   * @param checkpointId The checkpoint id
   * @param result       The future to complete with the checkpoint bytes
   */
  @Override
  public void checkpointCoordinator(long checkpointId, CompletableFuture<byte[]> result) {
    executor.execute(
        () -> {
          try {
            result.complete(new byte[0]);
          } catch (Throwable cause) {
            // when a checkpoint fails, propagate the failure through the future.
            final String message = String.format("Failed to checkpoint Instant %s for source %s",
                this.instant, this.getClass().getSimpleName());
            result.completeExceptionally(new CompletionException(message, cause));
          }
        }, "taking checkpoint %d", checkpointId
    );
  }

  /**
   * Commits the current instant once a checkpoint completes. When the commit succeeds, it
   * schedules the compaction (if enabled), starts a new instant and triggers the hive and
   * metadata table syncs (if enabled).
   *
   * @param checkpointId The id of the completed checkpoint
   */
  @Override
  public void notifyCheckpointComplete(long checkpointId) {
    executor.execute(
        () -> {
          // for streaming mode, commits the ever received events anyway,
          // the stream write task snapshot and flush the data buffer synchronously in sequence,
          // so a successful checkpoint subsumes the old one(follows the checkpoint subsuming contract)
          if (!commitInstant(this.instant)) {
            // nothing was committed, keep working with the current instant
            return;
          }
          // if async compaction is on, schedule the compaction
          if (tableState.scheduleCompaction) {
            writeClient.scheduleCompaction(Option.empty());
          }
          // start new instant.
          startInstant();
          // sync Hive if is enabled
          syncHiveIfEnabled();
          // sync metadata if is enabled
          syncMetadataIfEnabled();
        }, "commits the instant %s", this.instant
    );
  }

  /**
   * Does nothing: no coordinator state is restored from the checkpoint data.
   *
   * @param checkpointID   The checkpoint id, ignored
   * @param checkpointData The checkpoint bytes, ignored
   */
  @Override
  public void resetToCheckpoint(long checkpointID, byte[] checkpointData) {
    // no operation
  }

  /**
   * Handles an event sent by a write task. Only {@link WriteMetadataEvent}s are accepted;
   * they are dispatched as bootstrap, end-input or regular write metadata events, in that priority.
   *
   * @param i             The index of the sending subtask
   * @param operatorEvent The received event
   */
  @Override
  public void handleEventFromOperator(int i, OperatorEvent operatorEvent) {
    executor.execute(
        () -> {
          // only write metadata events are expected from the write tasks
          ValidationUtils.checkState(operatorEvent instanceof WriteMetadataEvent,
              "The coordinator can only handle WriteMetaEvent");
          final WriteMetadataEvent metadataEvent = (WriteMetadataEvent) operatorEvent;
          if (metadataEvent.isBootstrap()) {
            handleBootstrapEvent(metadataEvent);
            return;
          }
          if (metadataEvent.isEndInput()) {
            handleEndInputEvent(metadataEvent);
            return;
          }
          handleWriteMetaEvent(metadataEvent);
        }, "handle write metadata event for instant %s", this.instant
    );
  }

  /**
   * Clears the buffered event of a failed subtask.
   *
   * @param i         The index of the failed subtask, used as the index into the event buffer
   * @param throwable The failure cause, may be null
   */
  @Override
  public void subtaskFailed(int i, @Nullable Throwable throwable) {
    // reset the event; note this runs on the calling thread, it is not submitted to the executor
    this.eventBuffer[i] = null;
    LOG.warn("Reset the event for task [" + i + "]", throwable);
  }

  /**
   * Does nothing when a subtask is reset.
   *
   * @param i The index of the subtask, ignored
   * @param l Ignored; presumably the id of the checkpoint the subtask was reset to — confirm against the interface
   */
  @Override
  public void subtaskReset(int i, long l) {
    // no operation
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * Creates the executor and the context used for the hive sync.
   * Called from {@link #start()} only when hive sync is enabled.
   */
  private void initHiveSync() {
    this.hiveSyncExecutor = new NonThrownExecutor(LOG, true);
    this.hiveSyncContext = HiveSyncContext.create(conf);
  }

  /**
   * Submits a hive sync job to the hive sync executor, does nothing if hive sync is disabled.
   */
  private void syncHiveIfEnabled() {
    if (!tableState.syncHive) {
      return;
    }
    this.hiveSyncExecutor.execute(this::syncHive, "sync hive metadata for instant %s", this.instant);
  }

  /**
   * Sync hoodie table metadata to Hive metastore.
   *
   * <p>Uses the sync tool provided by the hive sync context, so the context must have been
   * initialized (see {@link #initHiveSync()}) before this method is called.
   */
  public void syncHive() {
    hiveSyncContext.hiveSyncTool().syncHoodieTable();
  }

  /**
   * Creates the executor used for the metadata table sync.
   * Called from {@link #start()} only when metadata sync is enabled.
   */
  private void initMetadataSync() {
    this.metadataSyncExecutor = new NonThrownExecutor(LOG, true);
  }

  /**
   * Submits a metadata table sync job to the metadata sync executor,
   * does nothing if metadata sync is disabled.
   */
  private void syncMetadataIfEnabled() {
    if (!tableState.syncMetadata) {
      return;
    }
    this.metadataSyncExecutor.execute(
        this::syncMetadata, "sync metadata table for instant %s", this.instant);
  }

  /**
   * Sync the write metadata to the metadata table, synchronously on the calling thread,
   * by delegating to the write client.
   */
  private void syncMetadata() {
    this.writeClient.syncTableMetadata();
  }

  /**
   * Replaces the event buffer with a fresh array that has one (initially null) slot per subtask.
   */
  private void reset() {
    this.eventBuffer = new WriteMetadataEvent[this.parallelism];
  }

  /**
   * Checks whether the buffer is ready to commit: every slot holds an event
   * that reports itself ready for the current instant.
   *
   * @return true if all the subtasks have delivered a ready event
   */
  private boolean allEventsReceived() {
    for (WriteMetadataEvent buffered : this.eventBuffer) {
      if (buffered == null || !buffered.isReady(this.instant)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Buffers the event in the slot of its task: stores it if the slot is empty,
   * otherwise merges it into the event that is already buffered.
   *
   * @param event The event to buffer
   */
  private void addEventToBuffer(WriteMetadataEvent event) {
    final int slot = event.getTaskID();
    final WriteMetadataEvent buffered = this.eventBuffer[slot];
    if (buffered == null) {
      this.eventBuffer[slot] = event;
    } else {
      buffered.mergeWith(event);
    }
  }

  /**
   * Starts a new instant: creates a new instant time, starts the commit with it,
   * remembers it as the current instant and transitions it from requested to inflight.
   */
  private void startInstant() {
    final String newInstant = HoodieActiveTimeline.createNewInstantTime();
    final String action = tableState.commitAction;
    this.writeClient.startCommitWithTime(newInstant, action);
    // publish the instant only after the commit was started successfully
    this.instant = newInstant;
    this.writeClient.transitionRequestedToInflight(action, newInstant);
    LOG.info("Create instant [{}] for table [{}] with type [{}]", newInstant,
        this.conf.getString(FlinkOptions.TABLE_NAME), conf.getString(FlinkOptions.TABLE_TYPE));
  }

  /**
   * Initializes the instant.
   *
   * <p>Recommits the given instant if it is non-empty and not yet on the completed timeline,
   * i.e. its write metadata was checkpointed but the commit did not happen.
   *
   * <p>Then starts a new instant, a writer can not flush data buffer
   * until it finds a new inflight instant on the timeline.
   *
   * @param instant The instant reported by the bootstrap events
   */
  private void initInstant(String instant) {
    // the completed timeline is loaded eagerly, before the job is submitted to the executor
    final HoodieTimeline completed =
        StreamerUtil.createMetaClient(conf).getActiveTimeline().filterCompletedInstants();
    executor.execute(() -> {
      final boolean pendingCommit = !instant.equals("") && !completed.containsInstant(instant);
      if (pendingCommit) {
        LOG.info("Recommit instant {}", instant);
        commitInstant(instant);
      } else {
        // the last instant committed successfully
        reset();
      }
      if (tableState.syncMetadata) {
        // initialize metadata table first if enabled
        // condition: the data set timeline has committed instants
        syncMetadata();
      }
      // starts a new instant
      startInstant();
    }, "initialize instant %s", instant);
  }

  /**
   * Buffers a bootstrap event (overwriting the slot of its task) and, once every slot holds
   * a bootstrap event, initializes the instant with the instant time of the given event.
   *
   * @param event The bootstrap event
   */
  private void handleBootstrapEvent(WriteMetadataEvent event) {
    this.eventBuffer[event.getTaskID()] = event;
    for (WriteMetadataEvent buffered : this.eventBuffer) {
      if (buffered == null || !buffered.isBootstrap()) {
        // still waiting for the bootstrap events of the other tasks
        return;
      }
    }
    // start to initialize the instant.
    initInstant(event.getInstantTime());
  }

  /**
   * Buffers an end-input event and, once all the tasks are ready, commits the current instant
   * and triggers the hive and metadata table syncs (if enabled).
   *
   * @param event The end-input event
   */
  private void handleEndInputEvent(WriteMetadataEvent event) {
    addEventToBuffer(event);
    if (!allEventsReceived()) {
      return;
    }
    // start to commit the instant.
    commitInstant(this.instant);
    // sync Hive if is enabled in batch mode.
    syncHiveIfEnabled();
    // sync metadata if is enabled in batch mode.
    syncMetadataIfEnabled();
  }

  /**
   * Buffers a regular write metadata event after validating that its instant time
   * is not greater than the current instant.
   *
   * @param event The write metadata event
   */
  private void handleWriteMetaEvent(WriteMetadataEvent event) {
    // the write task does not block after checkpointing(and before it receives a checkpoint success event),
    // if it checkpoints succeed then flushes the data buffer again before this coordinator receives a checkpoint
    // success event, the data buffer would flush with an older instant time.
    final String eventInstant = event.getInstantTime();
    final boolean notNewerThanCurrent =
        HoodieTimeline.compareTimestamps(this.instant, HoodieTimeline.GREATER_THAN_OR_EQUALS, eventInstant);
    final String errorMessage = String.format("Receive an unexpected event for instant %s from task %d",
        eventInstant, event.getTaskID());
    ValidationUtils.checkState(notNewerThanCurrent, errorMessage);

    addEventToBuffer(event);
  }

  /**
   * The coordinator reuses the instant if there is no data for this round of checkpoint,
   * sends the commit ack events to unblock the flushing.
   *
   * <p>One ack event is sent to every subtask, then the method blocks until all the sends finish.
   *
   * @throws HoodieException if a task is not running, if any send fails, or if the waiting thread
   *                         is interrupted (the interrupt flag is restored before throwing)
   */
  private void sendCommitAckEvents() {
    CompletableFuture<?>[] futures = IntStream.range(0, this.parallelism)
        .mapToObj(taskID -> {
          try {
            return this.context.sendEvent(CommitAckEvent.getInstance(), taskID);
          } catch (TaskNotRunningException e) {
            throw new HoodieException("Error while sending commit ack event to task [" + taskID + "]", e);
          }
        }).toArray(CompletableFuture<?>[]::new);
    try {
      CompletableFuture.allOf(futures).get();
    } catch (InterruptedException e) {
      // restore the interrupt flag so that the owner of the thread can still observe the interruption
      Thread.currentThread().interrupt();
      throw new HoodieException("Error while waiting for the commit ack events to finish sending", e);
    } catch (Exception e) {
      throw new HoodieException("Error while waiting for the commit ack events to finish sending", e);
    }
  }

  /**
   * Commits the instant with the write statuses of all the buffered events.
   *
   * <p>Nothing is committed when the buffer holds no event at all, or when the buffered events
   * carry no write status; in the latter case the buffer is reset and the commit ack events are sent.
   *
   * @param instant The instant to commit
   * @return true if the write statuses are committed successfully.
   */
  private boolean commitInstant(String instant) {
    final boolean noEventBuffered = Arrays.stream(eventBuffer).noneMatch(Objects::nonNull);
    if (noEventBuffered) {
      // The last checkpoint finished successfully.
      return false;
    }

    final List<WriteStatus> writeResults = Arrays.stream(eventBuffer)
        .filter(Objects::nonNull)
        .flatMap(buffered -> buffered.getWriteStatuses().stream())
        .collect(Collectors.toList());

    if (!writeResults.isEmpty()) {
      doCommit(instant, writeResults);
      return true;
    }

    // No data has written, reset the buffer and returns early
    reset();
    // Send commit ack event to the write function to unblock the flushing
    sendCommitAckEvents();
    return false;
  }

  /**
   * Performs the actual commit action.
   *
   * <p>The instant is committed when no record failed, or when {@link FlinkOptions#IGNORE_FAILED}
   * is set. Otherwise the errors are logged, the instant is rolled back and an exception is thrown.
   *
   * @param instant      The instant to commit
   * @param writeResults The write statuses to commit
   * @throws HoodieException if the write client reports a failed commit, or if the instant
   *                         was rolled back because of write errors
   */
  @SuppressWarnings("unchecked")
  private void doCommit(String instant, List<WriteStatus> writeResults) {
    // commit or rollback
    long totalErrorRecords = writeResults.stream().mapToLong(WriteStatus::getTotalErrorRecords).sum();
    long totalRecords = writeResults.stream().mapToLong(WriteStatus::getTotalRecords).sum();
    boolean hasErrors = totalErrorRecords > 0;

    if (!hasErrors || this.conf.getBoolean(FlinkOptions.IGNORE_FAILED)) {
      HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
      if (hasErrors) {
        LOG.warn("Some records failed to merge but forcing commit since commitOnErrors set to true. Errors/Total="
            + totalErrorRecords + "/" + totalRecords);
      }

      // only the overwrite operations replace existing file groups
      final Map<String, List<String>> partitionToReplacedFileIds = tableState.isOverwrite
          ? writeClient.getPartitionToReplacedFileIds(tableState.operationType, writeResults)
          : Collections.emptyMap();
      boolean success = writeClient.commit(instant, writeResults, Option.of(checkpointCommitMetadata),
          tableState.commitAction, partitionToReplacedFileIds);
      if (success) {
        // the buffered events are committed, clear them for the next round
        reset();
        LOG.info("Commit instant [{}] success!", instant);
      } else {
        throw new HoodieException(String.format("Commit instant [%s] failed!", instant));
      }
    } else {
      LOG.error("Error when writing. Errors/Total=" + totalErrorRecords + "/" + totalRecords);
      LOG.error("The first 100 error messages");
      writeResults.stream().filter(WriteStatus::hasErrors).limit(100).forEach(ws -> {
        // the argument order must follow the placeholders: partition path, file id, then the error
        LOG.error("Global error for partition path {} and fileID {}: {}",
            ws.getPartitionPath(), ws.getFileId(), ws.getGlobalError());
        if (ws.getErrors().size() > 0) {
          ws.getErrors().forEach((key, value) -> LOG.trace("Error for key:" + key + " and value " + value));
        }
      });
      // Rolls back instant
      writeClient.rollback(instant);
      throw new HoodieException(String.format("Commit instant [%s] failed and rolled back !", instant));
    }
  }

  /**
   * Returns the current event buffer (the internal array itself, not a copy).
   * It is null after {@link #close()}.
   */
  @VisibleForTesting
  public WriteMetadataEvent[] getEventBuffer() {
    return eventBuffer;
  }

  /**
   * Returns the current instant time tracked by the coordinator.
   */
  @VisibleForTesting
  public String getInstant() {
    return instant;
  }

  /**
   * Returns the write client, which is created in {@link #start()}.
   */
  @VisibleForTesting
  @SuppressWarnings("rawtypes")
  public HoodieFlinkWriteClient getWriteClient() {
    return writeClient;
  }

  /**
   * Returns the coordinator context passed to the constructor.
   */
  @VisibleForTesting
  public Context getContext() {
    return context;
  }

  /**
   * Replaces the coordinator executor, closing the current one first if it exists.
   *
   * @param executor The new executor
   * @throws Exception if closing the current executor fails
   */
  @VisibleForTesting
  public void setExecutor(CoordinatorExecutor executor) throws Exception {
    if (this.executor != null) {
      this.executor.close();
    }
    this.executor = executor;
  }

  /**
   * Replaces the metadata sync executor, closing the current one first if it exists.
   *
   * @param executor The new executor
   * @throws Exception if closing the current executor fails
   */
  @VisibleForTesting
  public void setMetadataSyncExecutor(NonThrownExecutor executor) throws Exception {
    if (this.metadataSyncExecutor != null) {
      this.metadataSyncExecutor.close();
    }
    this.metadataSyncExecutor = executor;
  }

  // -------------------------------------------------------------------------
  //  Inner Class
  // -------------------------------------------------------------------------

  /**
   * Provider for {@link StreamWriteOperatorCoordinator}.
   *
   * <p>Holds the operator id and the config options, and creates a coordinator
   * with these options for a given coordinator context.
   */
  public static class Provider implements OperatorCoordinator.Provider {
    // id of the operator the coordinator belongs to
    private final OperatorID operatorId;
    // config options handed over to every created coordinator
    private final Configuration conf;

    /**
     * Creates the provider.
     *
     * @param operatorId The operator id
     * @param conf       The config options
     */
    public Provider(OperatorID operatorId, Configuration conf) {
      this.operatorId = operatorId;
      this.conf = conf;
    }

    @Override
    public OperatorID getOperatorId() {
      return this.operatorId;
    }

    /**
     * Creates a new {@link StreamWriteOperatorCoordinator} for the given context.
     */
    @Override
    public OperatorCoordinator create(Context context) {
      return new StreamWriteOperatorCoordinator(this.conf, context);
    }
  }

  /**
   * Snapshot of the table related settings, all derived once from the config options.
   */
  private static class TableState implements Serializable {
    private static final long serialVersionUID = 1L;

    final WriteOperationType operationType;
    final String commitAction;
    final boolean isOverwrite;
    final boolean scheduleCompaction;
    final boolean syncHive;
    final boolean syncMetadata;

    private TableState(Configuration conf) {
      final String operationName = conf.getString(FlinkOptions.OPERATION);
      this.operationType = WriteOperationType.fromValue(operationName);

      // the table type option is matched case-insensitively against the enum constants
      final String tableTypeName = conf.getString(FlinkOptions.TABLE_TYPE).toUpperCase(Locale.ROOT);
      final HoodieTableType tableType = HoodieTableType.valueOf(tableTypeName);
      this.commitAction = CommitUtils.getCommitActionType(this.operationType, tableType);

      this.isOverwrite = WriteOperationType.isOverwrite(this.operationType);
      this.scheduleCompaction = StreamerUtil.needsScheduleCompaction(conf);
      this.syncHive = conf.getBoolean(FlinkOptions.HIVE_SYNC_ENABLED);
      this.syncMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED);
    }

    /**
     * Creates the table state from the given config options.
     */
    public static TableState create(Configuration conf) {
      return new TableState(conf);
    }
  }
}
