Source-code analysis of the summary extraction mechanism in Nutch 0.9

Nutch 0.9 produces its highlighted search-result summaries through Lucene's highlighting code; the heart of that path is the Highlighter#getBestTextFragments method reproduced below.

/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 *
 * @param tokenStream
 * @param text
 * @param mergeContiguousFragments
 * @param maxNumFragments
 * @throws IOException
 */
public final TextFragment[] getBestTextFragments(
    TokenStream tokenStream,
    String text,
    boolean mergeContiguousFragments,
    int maxNumFragments)
    throws IOException
{
    ArrayList docFrags = new ArrayList();
    StringBuffer newText = new StringBuffer();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    // bounded priority queue that keeps the maxNumFragments highest-scoring fragments
    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {
        org.apache.lucene.analysis.Token token;
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text);

        TokenGroup tokenGroup = new TokenGroup();
        token = tokenStream.next();
        while ((token != null) && (token.startOffset() < maxDocBytesToAnalyze))
        {
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct(token)))
            {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment(token))
                {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(token, fragmentScorer.getTokenScore(token));

//            if (lastEndOffset > maxDocBytesToAnalyze)
//            {
//                break;
//            }
            token = tokenStream.next();
        }

        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0)
        {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
            // if there is text beyond the last token considered..
            (lastEndOffset < text.length())
            &&
            // and that text is not too large...
            (text.length() < maxDocBytesToAnalyze)
           )
        {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator i = docFrags.iterator(); i.hasNext();)
        {
            currentFrag = (TextFragment) i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore)
            {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments)
                { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */

            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insert(currentFrag);
        }

        // return the most relevant fragments; the queue pops lowest-score first,
        // so filling the array backwards leaves it ordered best-first
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--)
        {
            frag[i] = (TextFragment) fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments)
        {
            mergeContiguousFragments(frag);
            ArrayList fragTexts = new ArrayList();
            for (int i = 0; i < frag.length; i++)
            {
                if ((frag[i] != null) && (frag[i].getScore() > 0))
                {
                    fragTexts.add(frag[i]);
                }
            }
            frag = (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
        }

        return frag;
    }
    finally
    {
        if (tokenStream != null)
        {
            try
            {
                tokenStream.close();
            }
            catch (Exception e)
            {
                // ignore failures while closing the stream
            }
        }
    }
}
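
For context, here is a minimal sketch of how this low-level API is typically driven with the Lucene highlighter of the Nutch 0.9 era. The field name "content", the query term, the fragment size, and the <b> markup are illustrative assumptions, not values taken from Nutch:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;

public class HighlightSketch
{
    public static void main(String[] args) throws Exception
    {
        String text = "Nutch crawls the web and builds a Lucene index; "
                    + "summaries show the query terms in context.";
        // hypothetical field name and term, assumed for this sketch
        Query query = new TermQuery(new Term("content", "nutch"));

        // formatter wraps each hit; QueryScorer scores tokens against the query
        Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter("<b>", "</b>"),
            new QueryScorer(query));
        // SimpleFragmenter chops the text into fragments of roughly 40 chars
        highlighter.setTextFragmenter(new SimpleFragmenter(40));

        TokenStream tokens = new StandardAnalyzer()
            .tokenStream("content", new StringReader(text));

        // false = do not merge contiguous fragments; keep the best 3
        TextFragment[] frags = highlighter.getBestTextFragments(tokens, text, false, 3);
        for (int i = 0; i < frags.length; i++)
        {
            if ((frags[i] != null) && (frags[i].getScore() > 0))
            {
                System.out.println(frags[i].toString() + " (score=" + frags[i].getScore() + ")");
            }
        }
    }
}

Because getBestTextFragments returns TextFragment objects rather than plain strings, the caller can inspect each fragment's score and do its own filtering, as the loop above does; as the Javadoc notes, exposing that score information is exactly why the method was made public.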

   