Tuesday, April 15, 2008

Creating a Summary from a MarkLogic Search Result

It's called a summary, or a snippet, or context. It's the string beneath each search result that shows you some words around your search term(s) in the document that was returned.

There's a good one in lib-search if you're using it. I'm not ... yet. At first I tried to use just the relevant functions, but they weren't doing quite what I wanted and the approach seemed pretty heavy, especially when returning 25 documents per page. The additional things I wanted were the ability to ignore certain elements and to cross element boundaries. So, even though I'm an XQuery and MarkLogic rookie, I decided to try to roll my own!

module "http://greenwood.com"
default function namespace="http://www.w3.org/2003/05/xpath-functions"
declare namespace gpg="http://greenwood.com"

(: Take a search result and create a snippet of text based on the first hit
in the file. Exclude selected elements when generating the snippet. If the
hit is in an element that is removed, it will use the next available hit
or default to the first string of words available. Element boundaries are
ignored, which is a perceived benefit. :)

(: Placeholder strings that gpg:get-summary asks cts:highlight to emit around
each hit, so that gpg:create-snippet can locate the first hit in the flattened
text. gpg:create-snippet also passes them to replace() as patterns, so they
must stay free of regular-expression metacharacters. :)
define variable $gpg:START-TEXT as xs:string { "ML-HIT-START" }
define variable $gpg:END-TEXT as xs:string { "ML-HIT-END" }

(: Entry point: build a highlighted summary for one search result.
   $node        - the document (or fragment) returned by the search
   $cts-query   - the query that produced the result; used to locate hits
   $word-buffer - number of words of context wanted before the first hit
   Returns a <summary> element whose hits are wrapped in <span class="hit">. :)
define function gpg:get-summary($node as node(), $cts-query as cts:query, $word-buffer as xs:integer) as node()
{
(: First pass: surround every hit with the placeholder markers. :)
let $marked as node() := cts:highlight( $node, $cts-query, ($gpg:START-TEXT, $cts:text, $gpg:END-TEXT) )
(: Drop the unwanted elements, then cut a window of plain text around the first marker. :)
let $snippetText as xs:string := gpg:create-snippet(<summary> { gpg:remove-elements($marked) } </summary>, $word-buffer)
let $plainSummary := <summary> { $snippetText } </summary>
(: Second pass: the snippet is plain text by now, so highlight again to get the
   final markup. Running cts:highlight twice keeps the snippet logic simple. :)
return
cts:highlight($plainSummary, $cts-query, <span class="hit"> { $cts:text } </span> )
}

(: Cut a plain-text snippet out of a node whose hits have been wrapped in the
   $gpg:START-TEXT / $gpg:END-TEXT placeholders.
   $node        - node whose string value is searched for the first placeholder
   $word-buffer - number of whitespace-separated tokens to keep before the hit;
                  the window is $word-buffer * 2 tokens long in total (the
                  placeholder tokens themselves count towards that length)
   Returns the window as a single space-separated string with all placeholders
   removed. If no placeholder is found the window starts at the first token. :)
define function gpg:create-snippet($node as node(), $word-buffer as xs:integer) as xs:string
{
let $myString := normalize-space(string($node))
let $myTokenizedString := tokenize($myString, "\s")
(: Positions of every token that carries the start placeholder. contains() is
   used instead of an exact index-of() match because the placeholder is glued to
   the preceding text whenever the hit does not follow whitespace, e.g. "(term"
   or a hit at the start of an inline element. :)
let $myHitPositions := ( for $myToken at $myPosition in $myTokenizedString
                         where contains($myToken, $gpg:START-TEXT)
                         return $myPosition )
(: Use the first hit if there is one, else start from the first token. :)
let $myStartHit := if (exists($myHitPositions)) then $myHitPositions[1] else 1
(: Clamp the start to 1. A start of 0 must be clamped too, otherwise
   subsequence() silently returns one token fewer than requested. :)
let $myStart := if ( ($myStartHit - $word-buffer) < 1 ) then 1 else ($myStartHit - $word-buffer)
(: Third argument of subsequence() is a length, not an end position. :)
let $myLength := $word-buffer * 2
(: subsequence() tolerates a length that runs past the end of the sequence. :)
let $myTokenizedStringSmall := subsequence($myTokenizedString, $myStart, $myLength)
(: Join the sequence back together as a string with spaces between each item. :)
let $myUneditedString := string-join($myTokenizedStringSmall, " ")
(: Delete the placeholders together with the single space that separates each
   one from the hit text, so "(ML-HIT-START term ML-HIT-END)" becomes "(term)". :)
let $myEditedStringStart := replace($myUneditedString, concat($gpg:START-TEXT, " ?"), "")
let $myEditedStringEnd := replace($myEditedStringStart, concat(" ?", $gpg:END-TEXT), "")
(: normalize-space() tidies up a placeholder that sat on the edge of the window.
   When this is returned, run cts:highlight on it to get highlighting in the
   snippet. Or don't if it's not needed. :)
return normalize-space($myEditedStringEnd)
}

(: This group of functions is used to remove selected nodes recursively. This means we can
remove hits on head or metadata elements, which might look odd in a snippet.
gpg:remove-elements filters every child of $node through gpg:removal and returns
the surviving nodes in document order. The result is a sequence (possibly empty),
hence node()* rather than node(). :)
define function gpg:remove-elements($node as node()) as node()*
{
for $i in $node/node() return gpg:removal($i)
}
(: This function either removes a node or passes it to the correct handler for
processing. Removed nodes yield the empty sequence, so the return type is
node()* rather than node(). :)
define function gpg:removal($node as node()) as node()*
{
typeswitch($node)
case text() return gpg:text-handler($node)
case element(content-metadata) return () (: This is one that is removed. :)
case element(head) return ()
case element(entry-head) return ()
case element(taxonomy) return ()
case processing-instruction() return ()
case comment() return () (: Comments have no name, so they cannot be rebuilt as elements. :)
default return gpg:default-handler($node) (: The default is to return the node and recurse. :)
}
(: Copy a text node. Returns a new text node holding the string value of $node,
   or the empty sequence when $node is empty. :)
define function gpg:text-handler($node as node()?) as node()*
{
(: $node holds at most one item, so this loop runs zero times or once. :)
for $current in $node
return text { $current }
}
(: Rebuild an element under its local name, keeping its attributes and recursing
   into its children through gpg:remove-elements.
   Returns the empty sequence when $node is empty or is not an element, because
   such input has no local name to build an element from. :)
define function gpg:default-handler($node as node()?) as element()*
{
typeswitch($node)
case element() return
element { local-name($node) }
{ $node/@*, gpg:remove-elements($node) }
default return ()
}

If you're reading this at a narrower screen resolution, you may be losing the right-hand side of the code. Copy and paste it out to see it better.

UPDATE: This is painfully slow, primarily I think because my documents are too large to process like this. I'm working to make this leaner.

2 comments:

Vijay said...

Hi,

I am a beginner in Marklogic, so could you please explain me more about it.

Mattio Valentino said...

Hi Vijay,

The best advice I can give is to read through their documentation, which you can find on their developer site.

The "Developer's Guide" and the "Introduction to XQuery" are good places to start.

Be sure to set up their tool called "cq." While you're reading, load a few documents and try some queries in cq to get the hang of things.