HistogramDiffIndex.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.diff;

  11. import org.eclipse.jgit.internal.JGitText;

  12. /**
  13.  * Support {@link HistogramDiff} by computing occurrence counts of elements.
  14.  * <p>
  15.  * Each element in the range being considered is put into a hash table, tracking
  16.  * the number of times that distinct element appears in the sequence. Once all
  17.  * elements have been inserted from sequence A, each element of sequence B is
  18.  * probed in the hash table and the longest common subsequence with the lowest
  19.  * occurrence count in A is used as the result.
  20.  *
  21.  * @param <S>
  22.  *            type of the base sequence.
  23.  */
  24. final class HistogramDiffIndex<S extends Sequence> {
  25.     private static final int REC_NEXT_SHIFT = 28 + 8;

  26.     private static final int REC_PTR_SHIFT = 8;

  27.     private static final int REC_PTR_MASK = (1 << 28) - 1;

  28.     private static final int REC_CNT_MASK = (1 << 8) - 1;

  29.     private static final int MAX_PTR = REC_PTR_MASK;

  30.     private static final int MAX_CNT = (1 << 8) - 1;

  31.     private final int maxChainLength;

  32.     private final HashedSequenceComparator<S> cmp;

  33.     private final HashedSequence<S> a;

  34.     private final HashedSequence<S> b;

  35.     private final Edit region;

  36.     /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
  37.     private final int[] table;

  38.     /** Number of low bits to discard from a key to index {@link #table}. */
  39.     private final int keyShift;

  40.     /**
  41.      * Describes a unique element in sequence A.
  42.      *
  43.      * The records in this table are actually 3-tuples of:
  44.      * <ul>
  45.      * <li>index of next record in this table that has same hash code</li>
  46.      * <li>index of first element in this occurrence chain</li>
  47.      * <li>occurrence count for this element (length of locs list)</li>
  48.      * </ul>
  49.      *
  50.      * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
  51.      * a few bits wide. Elements that occur more frequently will have their
  52.      * count capped.
  53.      */
  54.     private long[] recs;

  55.     /** Number of elements in {@link #recs}; also is the unique element count. */
  56.     private int recCnt;

  57.     /**
  58.      * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
  59.      *
  60.      * For the sequence element {@code ptr}, the value stored at location
  61.      * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
  62.      * element in the sequence.
  63.      *
  64.      * Chains always run from the lowest index to the largest index. Therefore
  65.      * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
  66.      * This allows a chain to terminate with {@code 0}, as {@code 0} would never
  67.      * be a valid next element.
  68.      *
  69.      * The array is sized to be {@code region.getLengthA()} and element indexes
  70.      * are converted to array indexes by subtracting {@link #ptrShift}, which is
  71.      * just a cached version of {@code region.beginA}.
  72.      */
  73.     private int[] next;

  74.     /**
  75.      * For element {@code ptr} in A, index of the record in {@link #recs} array.
  76.      *
  77.      * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
  78.      * describing all occurrences of the element appearing in sequence A at
  79.      * position {@code ptr}. The record is needed to get the occurrence count of
  80.      * the element, or to locate all other occurrences of that element within
  81.      * sequence A. This index provides constant-time access to the record, and
  82.      * avoids needing to scan the hash chain.
  83.      */
  84.     private int[] recIdx;

  85.     /** Value to subtract from element indexes to key {@link #next} array. */
  86.     private int ptrShift;

  87.     private Edit lcs;

  88.     private int cnt;

  89.     private boolean hasCommon;

  90.     HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
  91.             HashedSequence<S> a, HashedSequence<S> b, Edit r) {
  92.         this.maxChainLength = maxChainLength;
  93.         this.cmp = cmp;
  94.         this.a = a;
  95.         this.b = b;
  96.         this.region = r;

  97.         if (region.endA >= MAX_PTR)
  98.             throw new IllegalArgumentException(
  99.                     JGitText.get().sequenceTooLargeForDiffAlgorithm);

  100.         final int sz = r.getLengthA();
  101.         final int tableBits = tableBits(sz);
  102.         table = new int[1 << tableBits];
  103.         keyShift = 32 - tableBits;
  104.         ptrShift = r.beginA;

  105.         recs = new long[Math.max(4, sz >>> 3)];
  106.         next = new int[sz];
  107.         recIdx = new int[sz];
  108.     }

  109.     Edit findLongestCommonSequence() {
  110.         if (!scanA())
  111.             return null;

  112.         lcs = new Edit(0, 0);
  113.         cnt = maxChainLength + 1;

  114.         for (int bPtr = region.beginB; bPtr < region.endB;)
  115.             bPtr = tryLongestCommonSequence(bPtr);

  116.         return hasCommon && maxChainLength < cnt ? null : lcs;
  117.     }

  118.     private boolean scanA() {
  119.         // Scan the elements backwards, inserting them into the hash table
  120.         // as we go. Going in reverse places the earliest occurrence of any
  121.         // element at the start of the chain, so we consider earlier matches
  122.         // before later matches.
  123.         //
  124.         SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
  125.             final int tIdx = hash(a, ptr);

  126.             int chainLen = 0;
  127.             for (int rIdx = table[tIdx]; rIdx != 0;) {
  128.                 final long rec = recs[rIdx];
  129.                 if (cmp.equals(a, recPtr(rec), a, ptr)) {
  130.                     // ptr is identical to another element. Insert it onto
  131.                     // the front of the existing element chain.
  132.                     //
  133.                     int newCnt = recCnt(rec) + 1;
  134.                     if (MAX_CNT < newCnt)
  135.                         newCnt = MAX_CNT;
  136.                     recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
  137.                     next[ptr - ptrShift] = recPtr(rec);
  138.                     recIdx[ptr - ptrShift] = rIdx;
  139.                     continue SCAN;
  140.                 }

  141.                 rIdx = recNext(rec);
  142.                 chainLen++;
  143.             }

  144.             if (chainLen == maxChainLength)
  145.                 return false;

  146.             // This is the first time we have ever seen this particular
  147.             // element in the sequence. Construct a new chain for it.
  148.             //
  149.             final int rIdx = ++recCnt;
  150.             if (rIdx == recs.length) {
  151.                 int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
  152.                 long[] n = new long[sz];
  153.                 System.arraycopy(recs, 0, n, 0, recs.length);
  154.                 recs = n;
  155.             }

  156.             recs[rIdx] = recCreate(table[tIdx], ptr, 1);
  157.             recIdx[ptr - ptrShift] = rIdx;
  158.             table[tIdx] = rIdx;
  159.         }
  160.         return true;
  161.     }

  162.     private int tryLongestCommonSequence(int bPtr) {
  163.         int bNext = bPtr + 1;
  164.         int rIdx = table[hash(b, bPtr)];
  165.         for (long rec; rIdx != 0; rIdx = recNext(rec)) {
  166.             rec = recs[rIdx];

  167.             // If there are more occurrences in A, don't use this chain.
  168.             if (recCnt(rec) > cnt) {
  169.                 if (!hasCommon)
  170.                     hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
  171.                 continue;
  172.             }

  173.             int as = recPtr(rec);
  174.             if (!cmp.equals(a, as, b, bPtr))
  175.                 continue;

  176.             hasCommon = true;
  177.             TRY_LOCATIONS: for (;;) {
  178.                 int np = next[as - ptrShift];
  179.                 int bs = bPtr;
  180.                 int ae = as + 1;
  181.                 int be = bs + 1;
  182.                 int rc = recCnt(rec);

  183.                 while (region.beginA < as && region.beginB < bs
  184.                         && cmp.equals(a, as - 1, b, bs - 1)) {
  185.                     as--;
  186.                     bs--;
  187.                     if (1 < rc)
  188.                         rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
  189.                 }
  190.                 while (ae < region.endA && be < region.endB
  191.                         && cmp.equals(a, ae, b, be)) {
  192.                     if (1 < rc)
  193.                         rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
  194.                     ae++;
  195.                     be++;
  196.                 }

  197.                 if (bNext < be)
  198.                     bNext = be;
  199.                 if (lcs.getLengthA() < ae - as || rc < cnt) {
  200.                     // If this region is the longest, or there are less
  201.                     // occurrences of it in A, its now our LCS.
  202.                     //
  203.                     lcs.beginA = as;
  204.                     lcs.beginB = bs;
  205.                     lcs.endA = ae;
  206.                     lcs.endB = be;
  207.                     cnt = rc;
  208.                 }

  209.                 // Because we added elements in reverse order index 0
  210.                 // cannot possibly be the next position. Its the first
  211.                 // element of the sequence and thus would have been the
  212.                 // value of as at the start of the TRY_LOCATIONS loop.
  213.                 //
  214.                 if (np == 0)
  215.                     break TRY_LOCATIONS;

  216.                 while (np < ae) {
  217.                     // The next location to consider was actually within
  218.                     // the LCS we examined above. Don't reconsider it.
  219.                     //
  220.                     np = next[np - ptrShift];
  221.                     if (np == 0)
  222.                         break TRY_LOCATIONS;
  223.                 }

  224.                 as = np;
  225.             }
  226.         }
  227.         return bNext;
  228.     }

  229.     private int hash(HashedSequence<S> s, int idx) {
  230.         return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
  231.     }

  232.     private static long recCreate(int next, int ptr, int cnt) {
  233.         return ((long) next << REC_NEXT_SHIFT) //
  234.                 | ((long) ptr << REC_PTR_SHIFT) //
  235.                 | cnt;
  236.     }

  237.     private static int recNext(long rec) {
  238.         return (int) (rec >>> REC_NEXT_SHIFT);
  239.     }

  240.     private static int recPtr(long rec) {
  241.         return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
  242.     }

  243.     private static int recCnt(long rec) {
  244.         return ((int) rec) & REC_CNT_MASK;
  245.     }

  246.     private static int tableBits(int sz) {
  247.         int bits = 31 - Integer.numberOfLeadingZeros(sz);
  248.         if (bits == 0)
  249.             bits = 1;
  250.         if (1 << bits < sz)
  251.             bits++;
  252.         return bits;
  253.     }
  254. }