--- /dev/null
+From d94cd0d3eec33e4290d7ca81918f5ac61444886e Mon Sep 17 00:00:00 2001
+From: Dumitru Ceara <dceara@redhat.com>
+Date: Tue, 26 Apr 2022 12:37:08 +0200
+Subject: [PATCH] ovsdb-idl: Support write-only-changed IDL monitor mode.
+
+At a first glance, change tracking should never be allowed for
+write-only columns. However, some clients (e.g., ovn-northd) that are
+mostly exclusive writers of a database, use change tracking to avoid
+duplicating the IDL row records into a local cache when implementing
+incremental processing.
+
+The default behavior of the IDL is to automatically turn a write-only
+column into a read-write column whenever the client enables change
+tracking for that column.
+
+For the afore mentioned clients, this becomes a performance issue.
+Commit 1cc618c32524 ("ovsdb-idl: Fix atomicity of writes that don't
+change a column's value.") explains why writes that don't change a
+column's value cannot be optimized out early if the column is
+read/write.
+
+Furthermore, if there is at least one record in any table that
+changed during a transaction, then *all* records that have been
+written are added to the transaction, even if their values didn't
+change. If there are many such rows (e.g., like in ovn-northd's
+case) this incurs a significant overhead because:
+a. the client has to build this large transaction
+b. the transaction has to be sent over the network
+c. the server needs to parse this (mostly) no-op update
+
+We now introduce new IDL APIs allowing users to set a new monitoring
+mode flag, OVSDB_IDL_WRITE_CHANGED_ONLY, to indicate to the IDL that the
+atomicity constraints may be relaxed and written columns that don't
+change value can be skipped from the current transaction.
+
+We benchmarked ovn-northd performance when using this new mode
+against NB and SB databases taken from ovn-kubernetes scale tests.
+We noticed that when a minor change is performed to the Northbound
+database (e.g., NB_Global.nb_cfg is incremented) the time it takes to
+build the Southbound transaction becomes negligible (vs ~1.5 seconds
+before this change).
+
+End-to-end ovn-kubernetes scale tests on 120-node clusters also show
+significant reduction of latency to bring up pods; both average and P99
+latency decreased by ~30%.
+
+Acked-by: Han Zhou <hzhou@ovn.org>
+Signed-off-by: Dumitru Ceara <dceara@redhat.com>
+Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
+---
+ lib/ovsdb-idl.c | 60 +++++++++++++++++++++++++++++++++++++++++-----
+ lib/ovsdb-idl.h | 16 +++++++++++--
+ tests/ovsdb-idl.at | 31 +++++++++++++++++++++++-
+ tests/test-ovsdb.c | 18 ++++++++++----
+ 5 files changed, 115 insertions(+), 14 deletions(-)
+
+--- a/lib/ovsdb-idl.c
++++ b/lib/ovsdb-idl.c
+@@ -1398,6 +1398,45 @@ ovsdb_idl_track_clear(struct ovsdb_idl *
+ {
+ ovsdb_idl_track_clear__(idl, false);
+ }
++
++/* Sets or clears (depending on 'enable') OVSDB_IDL_WRITE_CHANGED_ONLY
++ * for 'column' in 'idl', ensuring that the column will be included in a
++ * transaction only if its value has actually changed locally. Normally
++ * read/write columns that are written to are always included in the
++ * transaction but, in specific cases, when the application doesn't
++ * require atomicity of writes across different columns, the ones that
++ * don't change value may be skipped.
++ *
++ * This function should be called between ovsdb_idl_create() and
++ * the first call to ovsdb_idl_run().
++ */
++void
++ovsdb_idl_set_write_changed_only(struct ovsdb_idl *idl,
++ const struct ovsdb_idl_column *column,
++ bool enable)
++{
++ if (enable) {
++ *ovsdb_idl_get_mode(idl, column) |= OVSDB_IDL_WRITE_CHANGED_ONLY;
++ } else {
++ *ovsdb_idl_get_mode(idl, column) &= ~OVSDB_IDL_WRITE_CHANGED_ONLY;
++ }
++}
++
++/* Helper function to wrap calling ovsdb_idl_set_write_changed_only() for
++ * all columns that are part of 'idl'.
++ */
++void
++ovsdb_idl_set_write_changed_only_all(struct ovsdb_idl *idl, bool enable)
++{
++ for (size_t i = 0; i < idl->class_->n_tables; i++) {
++ const struct ovsdb_idl_table_class *tc = &idl->class_->tables[i];
++
++ for (size_t j = 0; j < tc->n_columns; j++) {
++ const struct ovsdb_idl_column *column = &tc->columns[j];
++ ovsdb_idl_set_write_changed_only(idl, column, enable);
++ }
++ }
++}
+ \f
+ static void
+ log_parse_update_error(struct ovsdb_error *error)
+@@ -3502,6 +3541,8 @@ ovsdb_idl_txn_write__(const struct ovsdb
+ {
+ struct ovsdb_idl_row *row = CONST_CAST(struct ovsdb_idl_row *, row_);
+ const struct ovsdb_idl_table_class *class;
++ unsigned char column_mode;
++ bool optimize_rewritten;
+ size_t column_idx;
+ bool write_only;
+
+@@ -3512,12 +3553,15 @@ ovsdb_idl_txn_write__(const struct ovsdb
+
+ class = row->table->class_;
+ column_idx = column - class->columns;
+- write_only = row->table->modes[column_idx] == OVSDB_IDL_MONITOR;
++ column_mode = row->table->modes[column_idx];
++ write_only = column_mode == OVSDB_IDL_MONITOR;
++ optimize_rewritten =
++ write_only || (column_mode & OVSDB_IDL_WRITE_CHANGED_ONLY);
++
+
+ ovs_assert(row->new_datum != NULL);
+ ovs_assert(column_idx < class->n_columns);
+- ovs_assert(row->old_datum == NULL ||
+- row->table->modes[column_idx] & OVSDB_IDL_MONITOR);
++ ovs_assert(row->old_datum == NULL || column_mode & OVSDB_IDL_MONITOR);
+
+ if (row->table->idl->verify_write_only && !write_only) {
+ VLOG_ERR("Bug: Attempt to write to a read/write column (%s:%s) when"
+@@ -3535,9 +3579,13 @@ ovsdb_idl_txn_write__(const struct ovsdb
+ * different value in that column since we read it. (But if a whole
+ * transaction only does writes of existing values, without making any real
+ * changes, we will drop the whole transaction later in
+- * ovsdb_idl_txn_commit().) */
+- if (write_only && ovsdb_datum_equals(ovsdb_idl_read(row, column),
+- datum, &column->type)) {
++ * ovsdb_idl_txn_commit().)
++ *
++ * The application may choose to bypass this restriction and always
++ * optimize by setting OVSDB_IDL_WRITE_CHANGED_ONLY.
++ */
++ if (optimize_rewritten && ovsdb_datum_equals(ovsdb_idl_read(row, column),
++ datum, &column->type)) {
+ goto discard_datum;
+ }
+
+--- a/lib/ovsdb-idl.h
++++ b/lib/ovsdb-idl.h
+@@ -158,7 +158,7 @@ bool ovsdb_idl_server_has_column(const s
+ * IDL will change the value returned by ovsdb_idl_get_seqno() when the
+ * column's value changes in any row.
+ *
+- * The possible mode combinations are:
++ * Typical mode combinations are:
+ *
+ * - 0, for a column that a client doesn't care about. This is the default
+ * for every column in every table, if the client passes false for
+@@ -181,10 +181,17 @@ bool ovsdb_idl_server_has_column(const s
+ * - (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT | OVSDB_IDL_TRACK), for a column
+ * that a client wants to track using the change tracking
+ * ovsdb_idl_track_get_*() functions.
++ *
++ * - (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT | OVSDB_IDL_WRITE_CHANGED_ONLY)
++ * is similar to (OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT) except that it
++ * only adds a written column to a transaction if the column's value
++ * has actually changed.
+ */
+ #define OVSDB_IDL_MONITOR (1 << 0) /* Replicate this column? */
+ #define OVSDB_IDL_ALERT (1 << 1) /* Alert client when column changes? */
+-#define OVSDB_IDL_TRACK (1 << 2)
++#define OVSDB_IDL_TRACK (1 << 2) /* Track column changes? */
++#define OVSDB_IDL_WRITE_CHANGED_ONLY \
++ (1 << 3) /* Write back only changed columns? */
+
+ void ovsdb_idl_add_column(struct ovsdb_idl *, const struct ovsdb_idl_column *);
+ void ovsdb_idl_add_table(struct ovsdb_idl *,
+@@ -233,6 +240,11 @@ bool ovsdb_idl_track_is_updated(const st
+ const struct ovsdb_idl_column *column);
+ void ovsdb_idl_track_clear(struct ovsdb_idl *);
+
++void ovsdb_idl_set_write_changed_only(struct ovsdb_idl *idl,
++ const struct ovsdb_idl_column *column,
++ bool enable);
++void ovsdb_idl_set_write_changed_only_all(struct ovsdb_idl *idl, bool enable);
++
+ \f
+ /* Reading the database replica. */
+
+--- a/tests/ovsdb-idl.at
++++ b/tests/ovsdb-idl.at
+@@ -94,6 +94,20 @@ m4_define([OVSDB_CHECK_IDL_C],
+ OVSDB_SERVER_SHUTDOWN
+ AT_CLEANUP])
+
++# same as OVSDB_CHECK_IDL but uses OVSDB_IDL_WRITE_CHANGED_ONLY.
++m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C],
++ [AT_SETUP([$1 - write-changed-only - C])
++ AT_KEYWORDS([ovsdb server idl positive $5])
++ AT_CHECK([ovsdb_start_idltest])
++ m4_if([$2], [], [],
++ [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])])
++ AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $3],
++ [0], [stdout], [ignore])
++ AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]),
++ [0], [$4])
++ OVSDB_SERVER_SHUTDOWN
++ AT_CLEANUP])
++
+ # same as OVSDB_CHECK_IDL but uses tcp.
+ m4_define([OVSDB_CHECK_IDL_TCP_C],
+ [AT_SETUP([$1 - C - tcp])
+@@ -264,6 +278,7 @@ m4_define([OVSDB_CHECK_IDL_SSL_PY],
+
+ m4_define([OVSDB_CHECK_IDL],
+ [OVSDB_CHECK_IDL_C($@)
++ OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C($@)
+ OVSDB_CHECK_IDL_TCP_C($@)
+ OVSDB_CHECK_IDL_TCP6_C($@)
+ OVSDB_CHECK_IDL_PY($@)
+@@ -1202,8 +1217,22 @@ m4_define([OVSDB_CHECK_IDL_TRACK_C],
+ OVSDB_SERVER_SHUTDOWN
+ AT_CLEANUP])
+
++m4_define([OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C],
++ [AT_SETUP([$1 - write-changed-only - C])
++ AT_KEYWORDS([ovsdb server idl tracking positive $5])
++ AT_CHECK([ovsdb_start_idltest])
++ m4_if([$2], [], [],
++ [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])])
++ AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c -w idl unix:socket $3],
++ [0], [stdout], [ignore])
++ AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]),
++ [0], [$4])
++ OVSDB_SERVER_SHUTDOWN
++ AT_CLEANUP])
++
+ m4_define([OVSDB_CHECK_IDL_TRACK],
+- [OVSDB_CHECK_IDL_TRACK_C($@)])
++ [OVSDB_CHECK_IDL_TRACK_C($@)
++ OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C($@)])
+
+ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated],
+ [['["idltest",
+--- a/tests/test-ovsdb.c
++++ b/tests/test-ovsdb.c
+@@ -56,6 +56,7 @@
+ VLOG_DEFINE_THIS_MODULE(test_ovsdb);
+
+ struct test_ovsdb_pvt_context {
++ bool write_changed_only;
+ bool track;
+ };
+
+@@ -91,6 +92,7 @@ parse_options(int argc, char *argv[], st
+ {"timeout", required_argument, NULL, 't'},
+ {"verbose", optional_argument, NULL, 'v'},
+ {"change-track", optional_argument, NULL, 'c'},
++ {"write-changed-only", optional_argument, NULL, 'w'},
+ {"magic", required_argument, NULL, OPT_MAGIC},
+ {"no-rename-open-files", no_argument, NULL, OPT_NO_RENAME_OPEN_FILES},
+ {"help", no_argument, NULL, 'h'},
+@@ -125,6 +127,10 @@ parse_options(int argc, char *argv[], st
+ pvt->track = true;
+ break;
+
++ case 'w':
++ pvt->write_changed_only = true;
++ break;
++
+ case OPT_MAGIC:
+ magic = optarg;
+ break;
+@@ -2681,6 +2687,7 @@ update_conditions(struct ovsdb_idl *idl,
+ static void
+ do_idl(struct ovs_cmdl_context *ctx)
+ {
++ struct test_ovsdb_pvt_context *pvt = ctx->pvt;
+ struct jsonrpc *rpc;
+ struct ovsdb_idl *idl;
+ unsigned int next_cond_seqno = 0;
+@@ -2690,9 +2697,6 @@ do_idl(struct ovs_cmdl_context *ctx)
+ int step = 0;
+ int error;
+ int i;
+- bool track;
+-
+- track = ((struct test_ovsdb_pvt_context *)(ctx->pvt))->track;
+
+ idl = ovsdb_idl_create(ctx->argv[1], &idltest_idl_class, true, true);
+ ovsdb_idl_set_leader_only(idl, false);
+@@ -2709,10 +2713,14 @@ do_idl(struct ovs_cmdl_context *ctx)
+ rpc = NULL;
+ }
+
+- if (track) {
++ if (pvt->track) {
+ ovsdb_idl_track_add_all(idl);
+ }
+
++ if (pvt->write_changed_only) {
++ ovsdb_idl_set_write_changed_only_all(idl, true);
++ }
++
+ setvbuf(stdout, NULL, _IONBF, 0);
+
+ symtab = ovsdb_symbol_table_create();
+@@ -2770,7 +2778,7 @@ do_idl(struct ovs_cmdl_context *ctx)
+ }
+
+ /* Print update. */
+- if (track) {
++ if (pvt->track) {
+ print_idl_track(idl, step++, terse);
+ ovsdb_idl_track_clear(idl);
+ } else {