 #define CLUSTERING_GENERIC_MULTI_THROTTLING_SERVER_HPP_
 
 #include <algorithm>
+#include <map>
 
 #include "arch/timing.hpp"
+#include "containers/priority_queue.hpp"
 #include "clustering/generic/multi_throttling_metadata.hpp"
 #include "clustering/generic/registrar.hpp"
 #include "rpc/mailbox/typed.hpp"
@@ -23,7 +25,7 @@ class multi_throttling_server_t :
             int capacity) :
         mailbox_manager(mm),
         user_data(ud),
-        total_tickets(capacity), free_tickets(capacity),
+        goal_capacity(capacity), total_tickets(capacity), free_tickets(capacity),
         reallocate_timer(reallocate_interval_ms, this),
         registrar(mailbox_manager, this)
         { }
@@ -35,6 +37,7 @@ class multi_throttling_server_t :
 
 private:
     static const int reallocate_interval_ms = 1000;
+    static const int fair_fraction_denom = 5;
 
     class client_t :
         public intrusive_list_node_t<client_t>,
@@ -66,11 +69,13 @@ class multi_throttling_server_t :
                 server_business_card_t(request_mailbox->get_address(),
                                        relinquish_tickets_mailbox->get_address()));
             parent->clients.push_back(this);
+            parent->adjust_total_tickets();
             parent->recompute_allocations();
         }
 
         ~client_t() {
             parent->clients.remove(this);
+            parent->adjust_total_tickets();
             parent->recompute_allocations();
             request_mailbox.reset();
             relinquish_tickets_mailbox.reset();
@@ -182,11 +187,11 @@ class multi_throttling_server_t :
         /* We divide the total number of tickets into two pools. The first pool
         is distributed evenly among all the clients. The second pool is
         distributed in proportion to the clients' QPS. */
-        static const double fair_fraction = 0.1;
-        int fair_tickets = static_cast<int>(total_tickets * fair_fraction);
+        int fair_tickets = std::max(static_cast<int>(clients.size()),
+                                    total_tickets / fair_fraction_denom);
         int qps_tickets = total_tickets - fair_tickets;
         int total_qps = 0;
-        for (client_t *c = clients.head(); c; c = clients.next(c)) {
+        for (client_t *c = clients.head(); c != NULL; c = clients.next(c)) {
             total_qps += c->estimate_qps();
         }
         if (clients.size() == 0) {
@@ -197,7 +202,7 @@ class multi_throttling_server_t :
             tickets will be distributed, but that's OK. */
             total_qps = 1;
         }
-        for (client_t *c = clients.head(); c; c = clients.next(c)) {
+        for (client_t *c = clients.head(); c != NULL; c = clients.next(c)) {
             /* This math isn't exact, but it's OK if the target tickets of all
             the clients don't add up to `total_tickets`. */
             c->set_target_tickets(fair_tickets / clients.size() +
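
For a concrete sense of the split (illustrative numbers, not from the commit): with goal_capacity = 500 and 3 connected clients, adjust_total_tickets() below yields total_tickets = 500 + 3 * 5 = 515, so fair_tickets = max(3, 515 / 5) = 103 and qps_tickets = 412. A client contributing 30 of a total 100 QPS then gets a target of 103 / 3 + 412 * 30 / 100 = 34 + 123 = 157 tickets; because of the integer division the targets need not sum exactly to total_tickets, as the comment above acknowledges.
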
@@ -206,69 +211,117 @@ class multi_throttling_server_t :
         redistribute_tickets();
     }
 
+    void adjust_total_tickets() {
+        /* If new clients connect, we adapt the total_tickets number rather than
+        just leaving it at goal_capacity.
+        This serves two purposes:
+        1. It makes sure that when a new client connects, we always have some
+        free tickets available to give to that client (note that clients here mean
+        cluster nodes, not application clients).
+        Otherwise new clients would have to wait until we send a relinquish_tickets
+        message to one of the existing clients, then wait until that other client
+        returns some of its tickets to us, which we could only then pass on to the
+        newly connected client. The result would be a delay until a new client
+        could actually process any query, which we would like to avoid.
+        2. If we have more clients than total_tickets / fair_fraction_denom, we would
+        end up assigning 0 tickets to some clients. Those clients could never
+        process any query. */
+
+        /* So fair_tickets in recompute_allocations() is at least 1 per client. */
+        int per_client_capacity = fair_fraction_denom;
+        int new_total_tickets = goal_capacity + clients.size() * per_client_capacity;
+        /* Note: This can temporarily make free_tickets negative */
+        int diff = new_total_tickets - total_tickets;
+        free_tickets += diff;
+        total_tickets = new_total_tickets;
+    }
+
     void return_tickets(int tickets) {
         free_tickets += tickets;
-        guarantee(free_tickets <= total_tickets);
         redistribute_tickets();
     }
 
     void redistribute_tickets() {
-        static const int chunk_size = 100;
-        static const int min_reasonable_tickets = 10;
-        client_t *neediest;
-        int gift_size;
-
-        /* First, look for a client with a critically low number of tickets.
-        They get priority in tickets. This prevents starvation. */
-        while (free_tickets > 0) {
-            gift_size = -1;
-            neediest = NULL;
-            for (client_t *c = clients.head(); c; c = clients.next(c)) {
-                if (c->get_current_tickets() < min_reasonable_tickets && c->get_current_tickets() < c->get_target_tickets()) {
-                    if (!neediest || c->get_current_tickets() < neediest->get_current_tickets()) {
-                        neediest = c;
-                        gift_size = std::min(c->get_target_tickets() - c->get_current_tickets(), free_tickets);
-                    }
+        if (free_tickets <= 0 || clients.empty()) {
+            return;
+        }
+
+        const int min_chunk_size = ceil_divide(100, static_cast<int>(clients.size()));
+        const int min_reasonable_tickets = 10;
+
+        {
+            /* We cannot risk a client disconnecting while we are in here. That would
+            invalidate the pointers in tickets_to_give. */
+            ASSERT_NO_CORO_WAITING;
+            std::map<client_t *, int> tickets_to_give;
+
+            /* First, look for clients with a critically low number of tickets.
+            They get priority in tickets. This prevents starvation. */
+            std::vector<client_t *> critical_clients;
+            critical_clients.reserve(clients.size());
+            for (client_t *c = clients.head(); c != NULL; c = clients.next(c)) {
+                if (c->get_current_tickets() < min_reasonable_tickets
+                    && c->get_current_tickets() < c->get_target_tickets()) {
+                    critical_clients.push_back(c);
                 }
             }
-
-            if (!neediest) {
-                break;
+            /* Distribute the available tickets among critical clients, up to a
+            gift size of `min_reasonable_tickets`. As a consequence of the
+            `ceil_divide()` in here, we still set gift_size to 1 even if we don't
+            have enough free tickets to give at least 1 to every critical client.
+            That way we will at least give something to the first couple
+            of clients. */
+            if (!critical_clients.empty()) {
+                int gift_size_for_critical_clients = std::min(min_reasonable_tickets,
+                    ceil_divide(free_tickets, critical_clients.size()));
+                for (auto itr = critical_clients.begin(); itr != critical_clients.end(); ++itr) {
+                    int tickets_client_actually_wants = std::max(0,
+                        (*itr)->get_target_tickets() - (*itr)->get_current_tickets());
+                    int gift_size = std::min(free_tickets,
+                        std::min(tickets_client_actually_wants, gift_size_for_critical_clients));
+                    free_tickets -= gift_size;
+                    tickets_to_give[*itr] += gift_size;
+                }
             }
-            guarantee(gift_size >= 0);
-            free_tickets -= gift_size;
-            neediest->give_tickets(gift_size);
-        }
 
-        /* Next, look for clients with a large difference between their target
-        number of tickets and their current number of tickets. But if the
-        difference is less than `chunk_size`, don't send any tickets at all
-        to avoid flooding the network with many small ticket updates. */
-        while (free_tickets > chunk_size) {
-            gift_size = -1;
-            neediest = NULL;
-            for (client_t *c = clients.head(); c; c = clients.next(c)) {
-                int need_size = c->get_target_tickets() - c->get_current_tickets();
-                if (need_size > chunk_size && (!neediest || need_size > neediest->get_target_tickets() - neediest->get_current_tickets())) {
-                    neediest = c;
-                    gift_size = chunk_size;
+            /* Next, look for clients with a large difference between their target
+            number of tickets and their current number of tickets. But if the
+            difference is less than `min_chunk_size`, don't send any tickets at all
+            to avoid flooding the network with many small ticket updates. */
+            priority_queue_t<std::pair<int, client_t *> > needy_clients;
+            for (client_t *c = clients.head(); c != NULL; c = clients.next(c)) {
+                int need_size = c->get_target_tickets()
+                    - c->get_current_tickets()
+                    - tickets_to_give[c];
+                if (need_size >= min_chunk_size) {
+                    needy_clients.push(std::pair<int, client_t *>(need_size, c));
+                }
+            }
+            while (free_tickets >= min_chunk_size && !needy_clients.empty()) {
+                std::pair<int, client_t *> neediest = needy_clients.pop();
+                free_tickets -= min_chunk_size;
+                tickets_to_give[neediest.second] += min_chunk_size;
+                neediest.first -= min_chunk_size;
+                if (neediest.first >= min_chunk_size) {
+                    /* Re-insert the client so it gets more tickets later */
+                    needy_clients.push(neediest);
                 }
             }
 
-            if (!neediest) {
-                break;
+            /* Now actually send the tickets to the clients */
+            for (auto itr = tickets_to_give.begin(); itr != tickets_to_give.end(); ++itr) {
+                if (itr->second > 0) {
+                    itr->first->give_tickets(itr->second);
+                }
             }
-            guarantee(gift_size >= 0);
-            free_tickets -= gift_size;
-            neediest->give_tickets(gift_size);
         }
     }
 
     mailbox_manager_t *const mailbox_manager;
     user_data_type user_data;
 
     intrusive_list_t<client_t> clients;
-    int total_tickets, free_tickets;
+    int goal_capacity, total_tickets, free_tickets;
 
     repeating_timer_t reallocate_timer;
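The new redistribute_tickets() batches all gifts into tickets_to_give and only sends them at the end, handing out chunks of min_chunk_size to whichever client's remaining deficit is currently largest. The following standalone sketch simulates that two-phase scheme; it is illustrative, not RethinkDB code: ceil_divide() is assumed to be integer division rounding up, std::priority_queue stands in for the repo's priority_queue_t (whose pop() returns the removed element), Client is a hypothetical stand-in for client_t, and the final mailbox send is replaced by a direct increment.

// Standalone sketch of the two-phase ticket redistribution above.
#include <algorithm>
#include <cstdio>
#include <map>
#include <queue>
#include <utility>
#include <vector>

// Assumed behavior of RethinkDB's helper: integer division, rounding up.
static int ceil_divide(int x, int y) { return (x + y - 1) / y; }

struct Client {  // hypothetical stand-in for client_t
    int current_tickets;
    int target_tickets;
};

void redistribute(std::vector<Client *> &clients, int &free_tickets) {
    if (free_tickets <= 0 || clients.empty()) return;

    const int min_chunk_size = ceil_divide(100, static_cast<int>(clients.size()));
    const int min_reasonable_tickets = 10;
    std::map<Client *, int> tickets_to_give;

    /* Phase 1: top up clients that are critically low, so nobody starves. */
    std::vector<Client *> critical;
    for (Client *c : clients) {
        if (c->current_tickets < min_reasonable_tickets &&
            c->current_tickets < c->target_tickets) {
            critical.push_back(c);
        }
    }
    if (!critical.empty()) {
        int gift = std::min(min_reasonable_tickets,
                            ceil_divide(free_tickets, static_cast<int>(critical.size())));
        for (Client *c : critical) {
            int wants = std::max(0, c->target_tickets - c->current_tickets);
            int g = std::min(free_tickets, std::min(wants, gift));
            free_tickets -= g;
            tickets_to_give[c] += g;
        }
    }

    /* Phase 2: hand out the rest in chunks, largest remaining deficit first. */
    std::priority_queue<std::pair<int, Client *> > needy;
    for (Client *c : clients) {
        int need = c->target_tickets - c->current_tickets - tickets_to_give[c];
        if (need >= min_chunk_size) needy.push(std::make_pair(need, c));
    }
    while (free_tickets >= min_chunk_size && !needy.empty()) {
        std::pair<int, Client *> top = needy.top();
        needy.pop();
        free_tickets -= min_chunk_size;
        tickets_to_give[top.second] += min_chunk_size;
        top.first -= min_chunk_size;
        if (top.first >= min_chunk_size) needy.push(top);  /* still needy: requeue */
    }

    /* Phase 3: apply the batched gifts (the real code sends mailbox messages). */
    for (auto &kv : tickets_to_give) kv.first->current_tickets += kv.second;
}

int main() {
    Client a = {2, 80}, b = {40, 200}, c = {150, 160};
    std::vector<Client *> clients = {&a, &b, &c};
    int free_tickets = 300;
    redistribute(clients, free_tickets);
    std::printf("a=%d b=%d c=%d free=%d\n",
                a.current_tickets, b.current_tickets, c.current_tickets, free_tickets);
    return 0;
}

Running it prints a=80 b=176 c=150 free=86: the starved client a is topped up first, the chunked loop then favors b's large deficit, and c, which is within min_chunk_size of its target, receives nothing, mirroring the flooding-avoidance comment in the diff.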