Sql-Server

計算表中單詞的出現速度很慢

  • August 29, 2019

考慮這些簡化的表格:

CREATE TABLE dbo.words
(
   id bigint NOT NULL IDENTITY (1, 1),
   word varchar(32) NOT NULL,
   hits int NULL
)

CREATE TABLE dbo.items
(
   id bigint NOT NULL IDENTITY (1, 1),
   body varchar(256) NOT NULL,
)

words表包含大約 9000 條記錄,每條記錄包含一個單詞(‘phone’、‘sofa’、‘house’、‘dog’、…) 該items表包含大約 12000 條記錄,每條記錄的正文不超過 256人物。

現在,我需要更新words表,計算表中有多少記錄items保存(至少一次)單詞欄位中的文本。我需要考慮部分單詞,因此所有這 4 條記錄都應計入單詞dog

'This is my dog'  
'I really like the movie dogma'  
'my cousin has sheepdogs'  
'dog dog dog doggerdy dog dog'

最後一個範例應僅計為一條記錄(至少包含一次“狗”一詞)。

我可以使用這個查詢:

UPDATE dbo.words
SET hits = (SELECT COUNT(*) FROM dbo.items WHERE body like '%' + word + '%')

但是,這非常慢,這將需要 10 多分鐘才能在我擁有的不太重的伺服器上完成。

AFAIK 索引無濟於事,我正在做 LIKE 搜尋。我也認為全文對我沒有幫助,因為我正在尋找開始、結束或包含我的搜尋詞的單詞。我在這裡可能是錯的。

關於如何加快速度的任何建議?

我發現加快領先萬用字元LIKE搜尋的最佳方法是使用 n-gram。我在 SQL Server中的 Trigram Wildcard String Search 中描述了該技術並提供了一個範例實現。

trigram 搜尋的基本思想非常簡單:

  1. 持久化目標數據的三字元子串(三元組)。
  2. 將搜尋詞拆分為三元組。
  3. 將搜尋三元組與儲存的三元組匹配(等式搜尋)。
  4. 與符合條件的行相交以查找匹配所有三元組的字元串。
  5. 將原始搜尋過濾器應用於大大減少的交叉點。

它可能適合您的需求,但請注意:

Trigram 搜尋不是靈丹妙藥。額外的儲存需求、實現複雜性以及對更新性能的影響都對其產生了很大影響。

測試

我使用莎士比亞全集執行了一個快速測試,用 15,838 行填充表格的body列。items我在words表格中載入了來自同一文本的 7,669 個唯一單詞。

在我的中檔筆記型電腦上,三元組結構在大約 2 秒內建構,以下更新語句在5 秒內完成:

UPDATE dbo.words WITH (TABLOCK)
SET hits = 
(
   SELECT COUNT_BIG(*) 
   FROM dbo.Items_TrigramSearch
       ('%' + word +'%') AS ITS
);

更新的單詞表的選擇:

樣本

我的文章中修改後的三元組腳本如下:

CREATE FUNCTION dbo.GenerateTrigrams (@string varchar(255))
RETURNS table
WITH SCHEMABINDING
AS RETURN
   WITH
       N16 AS 
       (
           SELECT V.v 
           FROM 
           (
               VALUES 
                   (0),(0),(0),(0),(0),(0),(0),(0),
                   (0),(0),(0),(0),(0),(0),(0),(0)
           ) AS V (v)),
       -- Numbers table (256)
       Nums AS 
       (
           SELECT n = ROW_NUMBER() OVER (ORDER BY A.v)
           FROM N16 AS A 
           CROSS JOIN N16 AS B
       ),
       Trigrams AS
       (
           -- Every 3-character substring
           SELECT TOP (CASE WHEN LEN(@string) > 2 THEN LEN(@string) - 2 ELSE 0 END)
               trigram = SUBSTRING(@string, N.n, 3)
           FROM Nums AS N
           ORDER BY N.n
       )
   -- Remove duplicates and ensure all three characters are alphanumeric
   SELECT DISTINCT 
       T.trigram
   FROM Trigrams AS T
   WHERE
       -- Binary collation comparison so ranges work as expected
       T.trigram COLLATE Latin1_General_BIN2 NOT LIKE '%[^A-Z0-9a-z]%';
GO
-- Trigrams for items table
CREATE TABLE dbo.ItemsTrigrams
(
   id integer NOT NULL,
   trigram char(3) NOT NULL
);
GO
-- Generate trigrams
INSERT dbo.ItemsTrigrams WITH (TABLOCKX)
   (id, trigram)
SELECT
   E.id,
   GT.trigram
FROM dbo.items AS E
CROSS APPLY dbo.GenerateTrigrams(E.body) AS GT;
GO
-- Trigram search index
CREATE UNIQUE CLUSTERED INDEX
   [CUQ dbo.ItemsTrigrams (trigram, id)]
ON dbo.ItemsTrigrams (trigram, id)
WITH (DATA_COMPRESSION = ROW);
GO
-- Selectivity of each trigram (performance optimization)
CREATE OR ALTER VIEW dbo.ItemsTrigramCounts
WITH SCHEMABINDING
AS
SELECT ET.trigram, cnt = COUNT_BIG(*)
FROM dbo.ItemsTrigrams AS ET
GROUP BY ET.trigram;
GO
-- Materialize the view
CREATE UNIQUE CLUSTERED INDEX
   [CUQ dbo.ItemsTrigramCounts (trigram)]
ON dbo.ItemsTrigramCounts (trigram);
GO
-- Most selective trigrams for a search string
-- Always returns a row (NULLs if no trigrams found)
CREATE FUNCTION dbo.Items_GetBestTrigrams (@string varchar(255))
RETURNS table
WITH SCHEMABINDING AS
RETURN
   SELECT
       -- Pivot
       trigram1 = MAX(CASE WHEN BT.rn = 1 THEN BT.trigram END),
       trigram2 = MAX(CASE WHEN BT.rn = 2 THEN BT.trigram END),
       trigram3 = MAX(CASE WHEN BT.rn = 3 THEN BT.trigram END)
   FROM 
   (
       -- Generate trigrams for the search string
       -- and choose the most selective three
       SELECT TOP (3)
           rn = ROW_NUMBER() OVER (
               ORDER BY ETC.cnt ASC),
           GT.trigram
       FROM dbo.GenerateTrigrams(@string) AS GT
       JOIN dbo.ItemsTrigramCounts AS ETC
           WITH (NOEXPAND)
           ON ETC.trigram = GT.trigram
       ORDER BY
           ETC.cnt ASC
   ) AS BT;
GO
-- Returns Example ids matching all provided (non-null) trigrams
CREATE FUNCTION dbo.Items_GetTrigramMatchIDs
(
   @Trigram1 char(3),
   @Trigram2 char(3),
   @Trigram3 char(3)
)
RETURNS @IDs table (id integer PRIMARY KEY)
WITH SCHEMABINDING AS
BEGIN
   IF  @Trigram1 IS NOT NULL
   BEGIN
       IF @Trigram2 IS NOT NULL
       BEGIN
           IF @Trigram3 IS NOT NULL
           BEGIN
               -- 3 trigrams available
               INSERT @IDs (id)
               SELECT ET1.id
               FROM dbo.ItemsTrigrams AS ET1 
               WHERE ET1.trigram = @Trigram1
               INTERSECT
               SELECT ET2.id
               FROM dbo.ItemsTrigrams AS ET2
               WHERE ET2.trigram = @Trigram2
               INTERSECT
               SELECT ET3.id
               FROM dbo.ItemsTrigrams AS ET3
               WHERE ET3.trigram = @Trigram3
               OPTION (MERGE JOIN);
           END;
           ELSE
           BEGIN
               -- 2 trigrams available
               INSERT @IDs (id)
               SELECT ET1.id
               FROM dbo.ItemsTrigrams AS ET1 
               WHERE ET1.trigram = @Trigram1
               INTERSECT
               SELECT ET2.id
               FROM dbo.ItemsTrigrams AS ET2
               WHERE ET2.trigram = @Trigram2
               OPTION (MERGE JOIN);
           END;
       END;
       ELSE
       BEGIN
           -- 1 trigram available
           INSERT @IDs (id)
           SELECT ET1.id
           FROM dbo.ItemsTrigrams AS ET1 
           WHERE ET1.trigram = @Trigram1;
       END;
   END;

   RETURN;
END;
GO
-- Search implementation
CREATE FUNCTION dbo.Items_TrigramSearch
(
   @Search varchar(255)
)
RETURNS table
WITH SCHEMABINDING
AS
RETURN
   SELECT
       Result.body
   FROM dbo.Items_GetBestTrigrams(@Search) AS GBT
   CROSS APPLY
   (
       -- Trigram search
       SELECT
           E.id,
           E.body
       FROM dbo.Items_GetTrigramMatchIDs
           (GBT.trigram1, GBT.trigram2, GBT.trigram3) AS MID
       JOIN dbo.Items AS E
           ON E.id = MID.id
       WHERE
           -- At least one trigram found 
           GBT.trigram1 IS NOT NULL
           AND E.body LIKE @Search

       UNION ALL

       -- Non-trigram search
       SELECT
           E.id,
           E.body
       FROM dbo.Items AS E
       WHERE
           -- No trigram found 
           GBT.trigram1 IS NULL
           AND E.body LIKE @Search
   ) AS Result;

唯一的其他更改是向items表中添加聚集索引:

CREATE UNIQUE CLUSTERED INDEX cuq ON dbo.items (id);

你確定你需要它更快嗎?您在 10 分鐘後取消了查詢,但您實際上無法判斷進度。如果取消時查詢已完成 90%+ 怎麼辦?您真正需要查詢的速度有多快?您多久會執行一次這樣的更新?

我問這些問題是因為我可以UPDATEMAXDOP 1. 該查詢也非常適合查詢並行性。當我強制查詢MAXDOP 8在我的機器上執行時,它會在 20 秒內完成。

請注意,排序規則在這裡可能很重要。上面的數字與排序規則有關SQL_Latin1_General_CP1_CS_AS。如果我將列排序規則更改為,Latin1_General_CI_AS那麼程式碼會慢八倍左右。此外,也許我的測試數據和硬體與你的有很大不同。我仍然建議估計查詢的總執行時間,然後決定是否需要嘗試更奇特的解決方案。您可以通過創建一個包含 1% 行的臨時表dbo.words並查看UPDATE較小表所需的時間來做到這一點。如果將查詢執行時間乘以 100,這應該是對真實事物的一個相當好的估計。

在下面的程式碼中,我使用CHARINDEX了代替,LIKE因為它在僅檢查另一個字元串中是否出現字元串時更快。如果需要,UPDATE可以鼓勵查詢與未記錄的使用提示並行執行ENABLE_PARALLEL_PLAN_PREFERENCE。這是查詢:

UPDATE #words
SET hits = (SELECT COUNT(*) FROM #items WHERE CHARINDEX(word, body) > 0)
OPTION (MAXDOP 1);

測試數據:

CREATE TABLE #items
(
   body varchar(256) NOT NULL
)

INSERT INTO #items WITH (TABLOCK)
SELECT TOP (12000) text
FROM sys.messages
WHERE LEN(text) <= 256
AND CAST(text AS VARCHAR(256)) = CAST(text AS NVARCHAR(256))
ORDER BY LEN(text) DESC;

CREATE TABLE #words
(
   id bigint NOT NULL IDENTITY (1, 1),
   word varchar(32) NOT NULL,
   hits int NULL,
   PRIMARY KEY (id)
)

INSERT INTO #words (word, hits)
SELECT DISTINCT TOP (9000)  LEFT(word, 32), NULL
FROM (
   SELECT LEFT(body, CHARINDEX(' ', body)) word
   FROM #items

   UNION ALL

   SELECT LEFT(body, -1 + CHARINDEX(' ', body)) a
   FROM #items

   UNION ALL

   SELECT RIGHT(body, CHARINDEX(' ', REVERSE(body)))
   FROM #items

   UNION ALL

   SELECT RIGHT(body, -1 + CHARINDEX(' ', REVERSE(body)))
   FROM #items
) q;

引用自:https://dba.stackexchange.com/questions/241185