diff -uNr abi-old/src/af/gr/xp/gr_Graphics.cpp abi-new/src/af/gr/xp/gr_Graphics.cpp --- abi-old/src/af/gr/xp/gr_Graphics.cpp 2005-03-05 05:30:33.000000000 +0000 +++ abi-new/src/af/gr/xp/gr_Graphics.cpp 2005-03-09 09:48:56.000000000 +0000 @@ -1095,23 +1095,35 @@ } /*! - return true if linebreak at character c is permissible - the built-in class is too simple to differentiate between breaks before and after character + * return true if linebreak after/before character c is permissible. */ -bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool /* bAfter */) +bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool bAfter) { + UT_return_val_if_fail(getApp(), false); + const class XAP_EncodingManager *EncMan=getApp()->getEncodingManager(); + UT_UCS4Char c[3]; + + UT_return_val_if_fail(EncMan, false); + iNext = -1; // we do not bother with this UT_return_val_if_fail(ri.m_pText && ri.m_pText->getStatus() == UTIter_OK, false); *(ri.m_pText) += ri.m_iOffset; UT_return_val_if_fail(ri.m_pText->getStatus() == UTIter_OK, false); - UT_UCS4Char c = ri.m_pText->getChar(); + /* + * For CJK we need to consider the characters either side as well. + */ + c[1] = ri.m_pText->getChar(); + --(*ri.m_pText); + c[0] = ri.m_pText->getChar(); + ++(*ri.m_pText); ++(*ri.m_pText); + c[2] = ri.m_pText->getChar(); - UT_return_val_if_fail(getApp(), false); - return getApp()->getEncodingManager()->can_break_at(c); + return EncMan->can_break_at(c, bAfter); } + /*! resetJustification() makes the data represented by ri unjustified and returns value by which the total width changed as a result such diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.cpp abi-new/src/af/xap/xp/xap_EncodingManager.cpp --- abi-old/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-05 05:30:41.000000000 +0000 +++ abi-new/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-09 09:49:09.369271800 +0000 @@ -536,6 +536,26 @@ return search_map_with_opt_suffix(m,fallback_key,fallback_key_final); } +/* + * Given a array "list" of type UT_UCS4Char this function returns true or false + * depending on whether "c" is found in the list. + * + * Note: For performance reasons this function assumes that the list entries + * are in NUMERICAL ORDER starting with the smallest value. + * For example, see cjkCantStartLine below. + */ +static bool isInList(UT_UCS4Char c, const UT_UCS4Char *list) +{ + int i=0; + + while ((c >= list[i]) && (list[i] != 0)) { + if (list[i]==c) + return true; + i++; + } + return false; +} + /* ************************* here begin tables *************************/ /* this array describes mapping form current encoding to Tex's encoding name. @@ -646,6 +666,7 @@ }; +#if 0 // This function is made obsolete by rules in gr_Graphics::canBreak() /* TODO I'm pretty sure you can't break Korean at any character. TODO And what about Japanese Katakana and Hiragana? @@ -656,6 +677,110 @@ {"1",cjk_languages}, {NULL} }; +#endif + +/* + * + * A list of characters that cannot appear at the beginning + * of a line. These must be in NUMERICAL ORDER. + * + * This is mainly for CJK languages in which we can break between + * any character. + */ +static const UT_UCS4Char cjkCantStartLine[]= +{ + 0x0021, // ASCII eclamation mark. + 0x0029, // ASCII right parenthesis + 0x002e, // ASCII full stop. + 0x002c, // ASCII colon. + 0x003b, // ASCII semicolon. + 0x003e, // ASCII greater-than sign. + 0x003f, // ASCII question mark. + 0x2019, // Right single quotation mark. (fullwidth?) + 0x201d, // Right double quotation marks. (fullwidth?) + 0x3001, // Ideographic comma. + 0x3002, // Ideographic full stop. + 0x300b, // Right double angle bracket. + 0xff01, // Fullwidth exclamation mark. + 0xff09, // Fullwidth right parenthesis. + 0xff0c, // Fullwidth comma. + 0xff1a, // Fullwidth colon. + 0xff1b, // Fullwidth semicolon. + 0xfe1e, // Fullwidth greater-than sign. + 0xff1f, // Fullwidth question mark. + 0xff3d, // Fullwidth right square bracket. + 0xff5d, // Fullwidth right curly bracket. + 0x0 +}; + +/* These must also be in NUMERICAL ORDER. */ +static const UT_UCS4Char cjkCantEndLine[]= +{ + 0x0028, // ASCII left parenthesis. + 0x003c, // ASCII less-than sign. + 0x201c, // Left double quotation mark. + 0x300a, // Left double angle bracket. + 0xff08, // Fullwidth left parenthesis. + 0xff1c, // Fullwifth less-than sign. + 0xff3b, // Fullwidth left square bracket. + 0xff5b, // Fullwidth left curly bracket. + 0x0 +}; + +/* + * A list of characters that, when in pairs, must not be split. + * These must also be in NUMERICAL ORDER. + */ +static const UT_UCS4Char cjkMustNotSplit[]= +{ + 0x2014, // EM Dash. + 0x2036 // Horizontal ellipsis. +}; + +static UT_UCSChar canBreakAfter[] = +{ + 0x0020, // UCS_SPACE + 0x0021, // ! + 0x0029, // ) + 0x002c, // , + 0x002d, // UCS_MINUS + 0x002e, // . + 0x003a, // : + 0x003b, // ; + 0x003e, // > + 0x003f, // ? + 0x005d, // ] + 0x007d, // } + 0x2010, // UCS_HYPHEN + 0x2013, // UCS_EN_DASH + 0x2014, // UCS_EM_DASH + 0x3002, // Ideographic full stop + 0x300b, // Right double angle bracket + 0xff01, // Fullwidth exclamation mark + 0xff09, // Fullwidth right parenthesis + 0xff0c, // Fullwidth comma + 0xff0d, // Fullwidth hypen-minus + 0xff1a, // Fullwidth colon + 0xff1b, // Fullwidth semi-colon + 0xff1f, // Fullwidth question mark + 0xff3d, // Fullwidth right square bracket + 0xff5d, // Fullwidth right curly bracket + 0x0 +}; + +static UT_UCSChar canBreakBefore[] = +{ + 0x0028, // ( + 0x003c, // < + 0x005b, // [ + 0x007b, // { + 0x300a, // Left double angle bracket + 0xff08, // Fullwidth left parenthesis + 0xff3b, // Fullwidth left square bracket + 0xff5b, // Fullwidth left curly bracket + 0x0 +}; + /* This table is useful since some iconv implementations don't know some cpNNNN @@ -961,6 +1086,39 @@ /* ************************* here end tables *************************/ +/* + * Given a character returns a structure which specifies: + * + * 1, If this is a CJK character or not. + * 2, If it is forbidden from ending a line. + * 3, If it is forbidden from starting a line. + * 4. If pairs of this character must to kept together and not broken + * accross lines. + */ +struct SCJKProp XAP_EncodingManager::charCJKProp(UT_UCS4Char c) const +{ + struct SCJKProp prop = {false, false, false, false}; + + // Does this fall into any of the ranges for CJK characters? + if ((c>=0x20000 && c<=0x2a6df) || (c>=0x2f800 && c<=0x2fa1f) || + (c>=0x3000 && c<=0x303f) || (c>=0x3200 && c<=0x32ff) || + (c>=0x3300 && c<=0x33ff) || (c>=0x3400 && c<=0x4dbf) || + (c>=0x4e00 && c<=0x9faf) || (c>=0xf900 && c<=0xfaff) || + (c>=0xfe30 && c<=0xfe4f)) + prop.cjk = true; + + if (isInList(c, cjkCantStartLine)) + prop.cantStartLine = true; + + if (isInList(c, cjkCantEndLine)) + prop.cantEndLine = true; + + if (isInList(c, cjkMustNotSplit)) + prop.mustNotSplit = true; + + return prop; +} + const XAP_LangInfo* XAP_EncodingManager::findLangInfo(const char* key,XAP_LangInfo::fieldidx idx) { if (idx > XAP_LangInfo::max_idx) @@ -1111,8 +1269,11 @@ { const char* str = search_rmap_with_opt_suffix(langcode_to_cjk,SEARCH_PARAMS); is_cjk_ = *str == '1'; - str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS); - can_break_words_ = *str == '1'; +// This is made obsolete by new CJK line break handling in +// gr_Graphics::canBreak. +// +// str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS); +// can_break_words_ = *str == '1'; } { if (cjk_locale()) { @@ -1160,10 +1321,13 @@ int XAP_EncodingManager__swap_stou,XAP_EncodingManager__swap_utos; + +#if 0 // This is made obsolete by new CJK line breaking in gr_Graphics::canBreak bool XAP_EncodingManager::can_break_words() const { return can_break_words_; } +#endif /* I'm not sure whether any non-cjk language doesn't make distinction @@ -1172,38 +1336,117 @@ */ bool XAP_EncodingManager::single_case() const { return cjk_locale(); } +#if 0 // This function is now performed by XAP_EncodingManager::char_cjk_prop bool XAP_EncodingManager::is_cjk_letter(UT_UCSChar c) const { if (!cjk_locale()) return 0; return (c>0xff); } +#endif bool XAP_EncodingManager::noncjk_letters(const UT_UCSChar* str,int len) const { if (!cjk_locale()) return 1; for(int i=0;i>}") and certain others cannot end a line + * (eg. "(<<{"). Also, a few characters like "--" must occur in pairs + * which cannot be split. This function returns true or false in such a + * way as to ensure these rules. The characters affected are defined + * in the table in xap_EncodingManager.cpp + * + * c[] should be an array of three characters. c[1] is the character at + * which we're being asked to break. c[0] is the char before and c[2] the + * char after. + * + */ +bool XAP_EncodingManager::can_break_at(const UT_UCSChar c[], bool bAfter) const +{ + struct SCJKProp cjkProp[3]; int i; + + // Get the CJK properties of the three chars. + for (i=0; i<3; i++) + cjkProp[i] = charCJKProp(c[i]); + +#ifdef UT_DEBUG + // This is just for debugging + const UT_uint32 iSize=256; + UT_uint32 iNumChars=0; + char debugbuf[iSize]; + iNumChars +=snprintf(debugbuf, iSize, "Break %s ", bAfter?"after":"before"); + for (i=0; i<3; i++) { + iNumChars += snprintf(debugbuf+iNumChars, iSize-iNumChars, + "c%d: %x(%s%s%s%s) ", i, c[i], + cjkProp[i].cjk?"C":".", + cjkProp[i].cantStartLine?"S":".", + cjkProp[i].cantEndLine?"E":".", + cjkProp[i].mustNotSplit?"N":"."); + } + snprintf(debugbuf+iNumChars, iSize-iNumChars, "\n"); + UT_DEBUGMSG(("%s",debugbuf)); +#endif + + if (bAfter) // We're being asked about breaking after. + { + if (cjkProp[1].mustNotSplit && c[1]==c[2]) + return false; + if (cjkProp[1].cantEndLine || !cjkProp[1].cjk) + return westernCanBreakAt(c[1], bAfter); + return !cjkProp[2].cantStartLine; + } + else // We're being asked about breaking before. + { + if (cjkProp[1].mustNotSplit && c[1]==c[0]) + return false; + if (cjkProp[1].cantStartLine || !cjkProp[1].cjk) + return westernCanBreakAt(c[1], bAfter); + return !cjkProp[1].cantEndLine; + } + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); } @@ -1267,7 +1510,10 @@ "--->8--------------\n" " WinLanguageCode is 0x%04x, WinCharsetCode is %d\n" - " cjk_locale %d, can_break_words %d, swap_utos %d, swap_stou %d\n", + " cjk_locale %d, " + // "can_break_words %d, " Made obsolete by new CJK line breaking in + // gr_Graphics:canBreak + "swap_utos %d, swap_stou %d\n", getLanguageISOName(), getLanguageISOTerritory() ? getLanguageISOTerritory() : "NULL", getNativeEncodingName(),getNativeSystemEncodingName(), getNative8BitEncodingName(),getNativeNonUnicodeEncodingName(), @@ -1275,7 +1521,7 @@ fallbackChar(1072), getTexPrologue(), getWinLanguageCode(), getWinCharsetCode(), - int(cjk_locale()), int(can_break_words()),int(swap_utos),int(swap_stou) + int(cjk_locale()), /*int(can_break_words()),*/ int(swap_utos),int(swap_stou) )); UT_ASSERT( UT_iconv_isValid(iconv_handle_N2U) && UT_iconv_isValid(iconv_handle_U2N) ); } diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.h abi-new/src/af/xap/xp/xap_EncodingManager.h --- abi-old/src/af/xap/xp/xap_EncodingManager.h 2005-03-05 05:30:41.000000000 +0000 +++ abi-new/src/af/xap/xp/xap_EncodingManager.h 2005-03-09 09:49:09.385269368 +0000 @@ -38,6 +38,17 @@ #include "ut_iconv.h" #include "ut_xml.h" +/* + * CJK character properties struct. + */ +struct SCJKProp { + bool cjk; + bool cantEndLine; + bool cantStartLine; + bool mustNotSplit; +}; + + struct ABI_EXPORT XAP_LangInfo { /*no memeber can have NULL value. If string is empty, then value is @@ -169,7 +180,7 @@ /* whether words can be broken at any character of the word (wide character, not byte). True for japanese. */ - virtual bool can_break_words() const; +// virtual bool can_break_words() const; /* returns true if there is no distinction between upper and lower @@ -189,12 +200,18 @@ Under CJK locales it returns 1 for cjk letters. Under non-CJK locales returns 0. */ - virtual bool can_break_at(const UT_UCSChar c) const; + virtual bool can_break_at(const UT_UCSChar c[], bool bAfter) const; /* This should be as precise as possible. */ - virtual bool is_cjk_letter(UT_UCSChar c) const; +// virtual bool is_cjk_letter(UT_UCSChar c) const; + + /* + * Returns a value that indicates if c is a CJK character and, if so, + * what it's line breaking properties are. + */ + virtual struct SCJKProp charCJKProp(UT_UCS4Char c) const; /* This is rather smart wrapper for wvLIDToCodePageConverter. diff -uNr abi-old/src/text/fmt/xp/fp_TextRun.cpp abi-new/src/text/fmt/xp/fp_TextRun.cpp --- abi-old/src/text/fmt/xp/fp_TextRun.cpp 2005-03-05 05:30:49.000000000 +0000 +++ abi-new/src/text/fmt/xp/fp_TextRun.cpp 2005-03-09 09:49:16.001263584 +0000 @@ -355,7 +355,7 @@ PD_StruxIterator text(getBlock()->getStruxDocHandle(), getBlockOffset() + fl_BLOCK_STRUX_OFFSET); UT_return_val_if_fail(text.getStatus() == UTIter_OK, false); - text.setUpperLimit(text.getPosition() + getLength() - 1); + //text.setUpperLimit(text.getPosition() + getLength() - 1); UT_return_val_if_fail(m_pRenderInfo, false); m_pRenderInfo->m_pText = &text; @@ -389,7 +389,7 @@ getBlockOffset() + fl_BLOCK_STRUX_OFFSET ); UT_return_val_if_fail(text.getStatus() == UTIter_OK, false); - text.setUpperLimit(text.getPosition() + getLength() - 1); +// text.setUpperLimit(text.getPosition() + getLength() - 1); UT_return_val_if_fail(m_pRenderInfo, false); m_pRenderInfo->m_pText = &text; @@ -532,7 +532,7 @@ offset + fl_BLOCK_STRUX_OFFSET); m_pRenderInfo->m_pText = &text; - text.setUpperLimit(text.getPosition() + getLength() - 1); +// text.setUpperLimit(text.getPosition() + getLength() - 1); UT_uint32 iPosStart = text.getPosition(); //bool bReverse = (getVisDirection() == UT_BIDI_RTL); @@ -564,8 +564,8 @@ text.setPosition(iPos); } - if (bForce || iNext == (UT_sint32)i || bCanBreak) - // && ((i + offset) != (getBlockOffset() + getLength() - 1)) + if (bForce || iNext == (UT_sint32)i || bCanBreak + && ((i + offset) != (getBlockOffset() + getLength() - 1)))// KAY: Enabled { UT_sint32 ispace = 0; if(c == UCS_SPACE) diff -uNr abi-old/src/text/fmt/xp/fv_View.cpp abi-new/src/text/fmt/xp/fv_View.cpp --- abi-old/src/text/fmt/xp/fv_View.cpp 2005-03-05 05:30:49.000000000 +0000 +++ abi-new/src/text/fmt/xp/fv_View.cpp 2005-03-09 09:49:16.205232576 +0000 @@ -9856,8 +9856,10 @@ // CJK-FIXME: this can work incorrectly under CJK locales // since it can give 'true' for UCS with value >0xff (like // quotes, etc). + // New function might be better. - R.Kay if (newWord || - XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i])) +// XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i])) + (XAP_EncodingManager::get_instance()->charCJKProp(pSpan[i]).cjk)) { wCount.word++;